diff --git a/CMakeLists.txt b/CMakeLists.txt
index bc89b6933001a05f67334cf4e1b16e3784a9b55d..d240c96768cadf33c54f7b524e63dd814754587b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,6 +4,8 @@ project (dap_chain_global_db)
 file(GLOB DAP_CHAIN_GLOBAL_DB_SRC *.c)
 file(GLOB DAP_CHAIN_GLOBAL_DB_HDR *.h)
 
+add_subdirectory(libdap-cuttdb)
+
 if(WIN32)
   include_directories(../libdap/src/win32/)
   include_directories(../3rdparty/libmemcached/)
@@ -19,10 +21,10 @@ endif()
 add_library(${PROJECT_NAME} STATIC ${DAP_CHAIN_GLOBAL_DB_SRC} ${DAP_CHAIN_GLOBAL_DB_HDR})
 
 if(WIN32)
-  target_link_libraries(dap_chain_global_db dap_core dap_crypto dap_chain dap_chain_crypto)
+  target_link_libraries(dap_chain_global_db dap_core dap_crypto dap_chain dap_chain_crypto dap_cuttdb)
 endif()
 if(UNIX)
-  target_link_libraries(dap_chain_global_db dap_core dap_crypto dap_chain dap_chain_crypto ldb talloc tevent sqlite3 ${CMAKE_CURRENT_SOURCE_DIR}/libcuttdb.a)
+  target_link_libraries(dap_chain_global_db dap_core dap_crypto dap_chain dap_chain_crypto ldb talloc tevent sqlite3 dap_cuttdb)
 endif()
 
 target_include_directories(dap_chain_global_db INTERFACE .)
diff --git a/libcuttdb.a b/libcuttdb.a
deleted file mode 100644
index eb6157bf89f6a454980eafe4af9f5a92651f63fe..0000000000000000000000000000000000000000
Binary files a/libcuttdb.a and /dev/null differ
diff --git a/libdap-cuttdb/CMakeLists.txt b/libdap-cuttdb/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4f309c1aba24d0fcb18a0f2658084ee0312894f6
--- /dev/null
+++ b/libdap-cuttdb/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.6) # list(FILTER) below requires CMake >= 3.6
+
+project(dap_cuttdb LANGUAGES C)
+
+find_package(Threads REQUIRED)
+
+file(GLOB cuttdb_src src/*.c)
+file(GLOB cuttdb_h src/*.h)
+
+# The server part ain't ported, and thus not built; neither are the tools and tests.
+# file(GLOB) yields absolute paths, so every pattern is anchored to the file's
+# basename ("[^/]+$"); otherwise a checkout directory such as ".../michael/..."
+# would match "ae." and silently drop every source file.
+foreach(cuttdb_skip ae server dump builddb test_mt)
+    list(FILTER cuttdb_src EXCLUDE REGEX "${cuttdb_skip}[^/]+$")
+endforeach()
+if(UNIX) # mman.* is left out of UNIX builds
+    list(FILTER cuttdb_src EXCLUDE REGEX "mman[^/]+$")
+    list(FILTER cuttdb_h EXCLUDE REGEX "mman[^/]+$")
+endif()
+
+add_library(${PROJECT_NAME} STATIC ${cuttdb_h} ${cuttdb_src})
+set_target_properties(${PROJECT_NAME} PROPERTIES C_STANDARD 11 C_EXTENSIONS ON POSITION_INDEPENDENT_CODE ON)
+target_compile_definitions(${PROJECT_NAME} PRIVATE _GNU_SOURCE)
+target_compile_options(${PROJECT_NAME} PRIVATE -Wall -Wextra)
+target_link_libraries(${PROJECT_NAME} PUBLIC Threads::Threads)
+target_include_directories(${PROJECT_NAME} INTERFACE src)
+
diff --git a/libdap-cuttdb/src/ae_epoll.c b/libdap-cuttdb/src/ae_epoll.c
new file mode 100644
index 0000000000000000000000000000000000000000..ff8591d86c3d7962c1b01e6ad51b09144ba74f04
--- /dev/null
+++ b/libdap-cuttdb/src/ae_epoll.c
@@ -0,0 +1,109 @@
+/* Linux epoll(2) based ae.c module
+ * Copyright (C) 2009-2010 Salvatore Sanfilippo - antirez@gmail.com
+ * Released under the BSD license. See the COPYING file for more info. */
+
+#include <sys/epoll.h>
+#include <errno.h>
+
+typedef struct aeApiState {
+    int epfd; /* epoll instance descriptor, obtained from epoll_create() */
+    struct epoll_event events[AE_SETSIZE]; /* buffer that epoll_wait() fills in aeApiPoll() */
+} aeApiState;
+
+static int aeApiCreate(EventLoop *eventLoop) {
+    aeApiState *state = malloc(sizeof(aeApiState));
+    /* Allocates the epoll backend state and stores it in eventLoop->apidata; 0 on success, -1 on failure. */
+    if (!state) return -1;
+    state->epfd = epoll_create(1024); /* 1024 is just a hint for the kernel */
+    if (state->epfd == -1) { free(state); return -1; } /* don't leak state on failure */
+    eventLoop->apidata = state;
+    return 0;
+}
+
+/*
+    Disabled: not referenced anywhere.
+static void aeApiFree(EventLoop *eventLoop) {
+    aeApiState *state = eventLoop->apidata;
+
+    close(state->epfd);
+    free(state);
+}
+*/
+
+static int aeApiAddEvent(EventLoop *eventLoop, int fd, int mask) {
+    /* Registers fd as a one-shot epoll watch; an already-registered fd (EEXIST) counts as success. */
+    aeApiState *backend = eventLoop->apidata;
+    struct epoll_event ev;
+    ev.events = EPOLLONESHOT
+        | ((mask & AE_READABLE) ? EPOLLIN : 0)
+        | ((mask & AE_WRITABLE) ? EPOLLOUT : 0);
+    ev.data.u64 = 0; /* clear the whole union first (avoids a valgrind warning) */
+    ev.data.fd = fd;
+    if (epoll_ctl(backend->epfd, EPOLL_CTL_ADD, fd, &ev) != -1 || errno == EEXIST)
+        return 0;
+    fprintf(stderr, "epoll_ctl(%d,%d) failed: %d\n", EPOLL_CTL_ADD,fd,errno);
+    return -1;
+}
+
+static int aeApiUpdateEvent(EventLoop *eventLoop, int fd, int mask) {
+    aeApiState *state = eventLoop->apidata;
+    struct epoll_event ee; /* new interest set for fd, applied with EPOLL_CTL_MOD */
+    ee.events = EPOLLONESHOT;
+    if (mask & AE_READABLE) ee.events |= EPOLLIN;
+    if (mask & AE_WRITABLE) ee.events |= EPOLLOUT;
+    ee.data.u64 = 0; /* avoid valgrind warning */
+    ee.data.fd = fd;
+    if (epoll_ctl(state->epfd, EPOLL_CTL_MOD,fd,&ee) == -1) {
+        fprintf(stderr, "epoll_ctl(%d,%d) failed: %d\n", EPOLL_CTL_MOD,fd,errno); /* log the op that failed */
+        return -1;
+    }
+    return 0;
+}
+
+static int aeApiDelEvent(EventLoop *eventLoop, int fd) {
+    aeApiState *state = eventLoop->apidata;
+    struct epoll_event ee;
+    /* Removes fd from the epoll set; ENOENT and EBADF are tolerated and reported as success. */
+    ee.events = 0;
+    ee.data.u64 = 0; /* avoid valgrind warning */
+    ee.data.fd = fd;
+    /* Note, Kernel < 2.6.9 requires a non null event pointer even for
+     * EPOLL_CTL_DEL. */
+    if ( epoll_ctl(state->epfd,EPOLL_CTL_DEL,fd,&ee) == -1 
+            && errno != ENOENT && errno != EBADF) {
+        fprintf(stderr, "epoll_ctl(%d,%d) failed: %d\n", EPOLL_CTL_DEL,fd,errno);
+        return -1;
+    }
+    return 0;
+}
+
+int aeApiPoll(EventLoop *eventLoop, struct timeval *tvp) {
+    aeApiState *state = eventLoop->apidata;
+    int retval, numevents = 0;
+    /* Waits in epoll_wait() (no timeout when tvp is NULL) and copies each ready fd into eventLoop->fired[]. */
+    retval = epoll_wait(state->epfd,state->events,AE_SETSIZE,
+            tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1);
+    if (retval > 0) {
+        int j;
+        /* NOTE(review): mask is computed below but never stored or returned -- confirm this is intended */
+        numevents = retval;
+        for (j = 0; j < numevents; j++) {
+            int mask = 0;
+            struct epoll_event *e = state->events+j;
+
+            if (e->events & EPOLLIN) mask |= AE_READABLE;
+            if (e->events & EPOLLOUT) mask |= AE_WRITABLE;
+            eventLoop->fired[j] = e->data.fd;
+        }
+    }
+    return numevents; /* number of fired entries written; 0 on timeout or error */
+}
+
+
+/*
+    Disabled: not referenced anywhere.
+static char *aeApiName(void) {
+    return "epoll";
+}
+*/
+
diff --git a/libdap-cuttdb/src/ae_kqueue.c b/libdap-cuttdb/src/ae_kqueue.c
new file mode 100644
index 0000000000000000000000000000000000000000..cd80a57be2d19d485f2a2ce3485b42494ba43640
--- /dev/null
+++ b/libdap-cuttdb/src/ae_kqueue.c
@@ -0,0 +1,91 @@
+/* Kqueue(2)-based ae.c module
+ * Copyright (C) 2009 Harish Mallipeddi - harish.mallipeddi@gmail.com
+ * Released under the BSD license. See the COPYING file for more info. */
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/time.h>
+
+typedef struct aeApiState {
+    int kqfd; /* kqueue descriptor, obtained from kqueue() */
+    struct kevent events[AE_SETSIZE]; /* buffer that kevent() fills in aeApiPoll() */
+} aeApiState;
+
+static int aeApiCreate(EventLoop *eventLoop) {
+    aeApiState *state = malloc(sizeof(aeApiState));
+    /* Allocates the kqueue backend state and stores it in eventLoop->apidata; 0 on success, -1 on failure. */
+    if (!state) return -1;
+    state->kqfd = kqueue();
+    if (state->kqfd == -1) { free(state); return -1; } /* don't leak state on failure */
+    eventLoop->apidata = state;
+
+    return 0;
+}
+
+static void aeApiFree(EventLoop *eventLoop) {
+    aeApiState *state = eventLoop->apidata;
+    /* Closes the kqueue descriptor and releases the backend state allocated by aeApiCreate(). */
+    close(state->kqfd);
+    free(state);
+}
+
+static int aeApiAddEvent(EventLoop *eventLoop, int fd, int mask) {
+    aeApiState *state = eventLoop->apidata;
+    struct kevent ke;
+    /* Adds one kqueue filter per requested direction; returns -1 as soon as a kevent() call fails. */
+    if (mask & AE_READABLE) {
+        EV_SET(&ke, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
+        if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1;
+    }
+    if (mask & AE_WRITABLE) {
+        EV_SET(&ke, fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
+        if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1;
+    }
+    return 0;
+}
+
+static int aeApiUpdateEvent(EventLoop *eventLoop, int fd, int mask) {
+    return aeApiAddEvent(eventLoop, fd, mask); /* updating simply re-issues the EV_ADD changes */
+}
+
+static int aeApiDelEvent(EventLoop *eventLoop, int fd) {
+    aeApiState *state = eventLoop->apidata;
+    struct kevent ke; /* EVFILT_* are filter ids, not bit flags: delete each filter separately */
+    EV_SET(&ke, fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
+    kevent(state->kqfd, &ke, 1, NULL, 0, NULL); /* best effort: result deliberately ignored */
+    EV_SET(&ke, fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
+    kevent(state->kqfd, &ke, 1, NULL, 0, NULL); /* best effort: result deliberately ignored */
+    return 0;
+}
+static int aeApiPoll(EventLoop *eventLoop, struct timeval *tvp) {
+    aeApiState *state = eventLoop->apidata;
+    int retval, numevents = 0;
+    /* Waits in kevent() (no timeout when tvp is NULL) and copies each ready ident into eventLoop->fired[]. */
+    if (tvp != NULL) {
+        struct timespec timeout;
+        timeout.tv_sec = tvp->tv_sec;
+        timeout.tv_nsec = tvp->tv_usec * 1000;
+        retval = kevent(state->kqfd, NULL, 0, state->events, AE_SETSIZE, &timeout);
+    } else {
+        retval = kevent(state->kqfd, NULL, 0, state->events, AE_SETSIZE, NULL);
+    }    
+
+    if (retval > 0) {
+        int j;
+        /* NOTE(review): mask is computed below but never stored or returned -- confirm this is intended */
+        numevents = retval;
+        for(j = 0; j < numevents; j++) {
+            int mask = 0;
+            struct kevent *e = state->events+j;
+            
+            if (e->filter == EVFILT_READ) mask |= AE_READABLE;
+            if (e->filter == EVFILT_WRITE) mask |= AE_WRITABLE;
+            eventLoop->fired[j] = e->ident; 
+        }
+    }
+    return numevents; /* number of fired entries written; 0 on timeout or error */
+}
+
+static char *aeApiName(void) {
+    return "kqueue"; /* name of this polling backend */
+}
diff --git a/libdap-cuttdb/src/ae_select.c b/libdap-cuttdb/src/ae_select.c
new file mode 100644
index 0000000000000000000000000000000000000000..1e5d3ae91aa886a4b086ff07c28b9e10045ea292
--- /dev/null
+++ b/libdap-cuttdb/src/ae_select.c
@@ -0,0 +1,72 @@
+/* Select()-based ae.c module
+ * Copyright (C) 2009-2010 Salvatore Sanfilippo - antirez@gmail.com
+ * Released under the BSD license. See the COPYING file for more info. */
+
+#include <string.h>
+
+typedef struct aeApiState {
+    fd_set rfds, wfds; /* master sets maintained by aeApiAddEvent()/aeApiDelEvent() */
+    /* We need to have a copy of the fd sets as it's not safe to reuse
+     * FD sets after select(). */
+    fd_set _rfds, _wfds; /* scratch copies that aeApiPoll() hands to select() */
+} aeApiState;
+
+static int aeApiCreate(EventLoop *eventLoop) {
+    aeApiState *state = malloc(sizeof(aeApiState));
+    /* Allocates the select backend state with empty master sets; 0 on success, -1 if malloc fails. */
+    if (!state) return -1;
+    FD_ZERO(&state->rfds);
+    FD_ZERO(&state->wfds);
+    eventLoop->apidata = state;
+    return 0;
+}
+
+static void aeApiFree(EventLoop *eventLoop) {
+    free(eventLoop->apidata); /* releases the state allocated by aeApiCreate() */
+}
+
+static int aeApiAddEvent(EventLoop *eventLoop, int fd, int mask) {
+    aeApiState *state = eventLoop->apidata;
+    /* Marks fd in the master read and/or write set according to mask; always returns 0. */
+    if (mask & AE_READABLE) FD_SET(fd,&state->rfds);
+    if (mask & AE_WRITABLE) FD_SET(fd,&state->wfds);
+    return 0;
+}
+
+static void aeApiDelEvent(EventLoop *eventLoop, int fd, int mask) {
+    aeApiState *state = eventLoop->apidata;
+    /* Clears fd from the master sets named by mask. NOTE(review): signature differs from the epoll/kqueue variants. */
+    if (mask & AE_READABLE) FD_CLR(fd,&state->rfds);
+    if (mask & AE_WRITABLE) FD_CLR(fd,&state->wfds);
+}
+
+static int aeApiPoll(EventLoop *eventLoop, struct timeval *tvp) {
+    aeApiState *state = eventLoop->apidata;
+    int retval, j, numevents = 0;
+    /* select() overwrites the sets it is given, so it always works on fresh copies of the master sets */
+    memcpy(&state->_rfds,&state->rfds,sizeof(fd_set));
+    memcpy(&state->_wfds,&state->wfds,sizeof(fd_set));
+    /* tvp is passed straight to select(): NULL blocks until an fd is ready */
+    retval = select(eventLoop->maxfd+1,
+                &state->_rfds,&state->_wfds,NULL,tvp);
+    if (retval > 0) {
+        for (j = 0; j <= eventLoop->maxfd; j++) {
+            int mask = 0;
+            aeFileEvent *fe = &eventLoop->events[j];
+
+            if (fe->mask == AE_NONE) continue;
+            if (fe->mask & AE_READABLE && FD_ISSET(j,&state->_rfds))
+                mask |= AE_READABLE;
+            if (fe->mask & AE_WRITABLE && FD_ISSET(j,&state->_wfds))
+                mask |= AE_WRITABLE;
+            eventLoop->fired[numevents].fd = j; /* every registered fd is recorded, even when mask stays 0 */
+            eventLoop->fired[numevents].mask = mask;
+            numevents++;
+        }
+    }
+    return numevents;
+}
+
+static char *aeApiName(void) {
+    return "select"; /* name of this polling backend */
+}
diff --git a/libdap-cuttdb/src/cdb_bgtask.c b/libdap-cuttdb/src/cdb_bgtask.c
new file mode 100644
index 0000000000000000000000000000000000000000..822c02c1299c1f03fd5b738f50183e474dffce0a
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_bgtask.c
@@ -0,0 +1,128 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#include "cdb_bgtask.h"
+#include <stdlib.h>
+#ifndef _WIN32
+#include <sys/signal.h>
+#else
+#include <signal.h>
+#endif
+
+
+/* where thread begins */
+static void *_cdb_bgtask_func(void *arg);
+
+
+CDBBGTASK *cdb_bgtask_new()
+{
+    CDBBGTASK *bt = (CDBBGTASK *)malloc(sizeof(CDBBGTASK));
+    if (bt == NULL) return NULL; /* out of memory: no manager is created */
+    bt->tnum = 0; /* empty task list */
+    bt->run = 0;  /* thread not started yet */
+    bt->tid = 0;  /* 0 means "no thread"; cdb_bgtask_add() relies on this */
+    pthread_cond_init(&bt->scond, NULL);
+    pthread_mutex_init(&bt->smutex, NULL);
+    return bt;
+}
+
+
+/* add a task into the task list; must be called before the thread runs. Returns 0, or -1 if rejected */
+int cdb_bgtask_add(CDBBGTASK *bt, TASKFUNC func, void *arg, int intval)
+{
+    TASK *task = &bt->tasks[bt->tnum];
+    /* ">=" (not ">"): with tnum == MAXTASKNUM the slot above lies one element past tasks[] */
+    if (bt->tid || bt->tnum >= MAXTASKNUM)
+        return -1;
+
+    task->arg = arg;
+    task->func = func;
+    task->intval = intval;
+    task->ltime = time(NULL); /* the first run happens intval seconds from now */
+    bt->tnum++;
+    return 0;
+}
+
+
+static void *_cdb_bgtask_func(void *arg)
+{
+    CDBBGTASK *bt = (CDBBGTASK *)arg;
+#ifndef _WIN32
+    /* block all signals coming into current thread */
+    sigset_t smask; /* the POSIX type taken by sigfillset()/pthread_sigmask() */
+    sigfillset(&smask);
+    pthread_sigmask(SIG_BLOCK, &smask, NULL);
+#endif
+    while(bt->run) {
+        time_t now = time(NULL);
+        struct timespec timeout;
+        /* check should run some tasks every 1 second */
+        timeout.tv_sec = now + 1;
+        timeout.tv_nsec = 0;
+        /* iterate and run the tasks whose interval has elapsed */
+        for(int i = 0; i < bt->tnum; i++) {
+            TASK *task = &bt->tasks[i];
+            if (now >= task->ltime + task->intval) {
+                task->func(task->arg);
+                task->ltime = now;
+            }
+        }
+        /* pthread_cond_timedwait() must be called with the mutex locked */
+        pthread_mutex_lock(&bt->smutex);
+        if (bt->run) pthread_cond_timedwait(&bt->scond, &bt->smutex, &timeout);
+        pthread_mutex_unlock(&bt->smutex);
+    }
+
+    return NULL;
+}
+
+
+/* create a thread for tasks; if the thread cannot be created the manager stays stopped */
+void cdb_bgtask_start(CDBBGTASK *bt)
+{
+    if (bt->run)
+        return;
+
+    bt->run = 1; /* must be set before the thread starts: its loop tests bt->run */
+    if (pthread_create(&bt->tid, NULL, _cdb_bgtask_func, bt) != 0)
+        bt->run = 0; /* no thread exists: keep cdb_bgtask_stop() from joining an invalid tid */
+}
+
+
+/* stop the task thread (if running), wait for it to exit, and empty the task list */
+void cdb_bgtask_stop(CDBBGTASK *bt)
+{
+    if (bt->run) {
+        bt->run = 0;
+        pthread_cond_signal(&bt->scond); /* wake the thread out of its timed wait */
+        pthread_join(bt->tid, NULL);
+        bt->tid = 0; /* forget the joined thread so cdb_bgtask_add() accepts tasks again */
+    }
+
+    bt->tnum = 0;
+}
+
+
+void cdb_bgtask_destroy(CDBBGTASK *bt)
+{
+    cdb_bgtask_stop(bt); /* make sure the thread has exited before its sync objects go away */
+    pthread_cond_destroy(&bt->scond);
+    pthread_mutex_destroy(&bt->smutex);
+    free(bt); /* bt must not be used after this call */
+}
+
+
+
+
diff --git a/libdap-cuttdb/src/cdb_bgtask.h b/libdap-cuttdb/src/cdb_bgtask.h
new file mode 100644
index 0000000000000000000000000000000000000000..6dee1b992d21bac8a496e7f8f89431c570b9358a
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_bgtask.h
@@ -0,0 +1,62 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#ifndef _CDB_BGTASK_H_
+#define _CDB_BGTASK_H_
+#include <time.h>
+#include <pthread.h>
+
+
+/* 16 tasks at most in a task thread */
+#define MAXTASKNUM 16
+
+typedef void (*TASKFUNC)(void *);
+
+/* one periodic task: func(arg) is called once at least intval seconds have passed since ltime */
+typedef struct {
+    /* task function */
+    TASKFUNC func;
+    /* task argument, passed unchanged to func */
+    void *arg;
+    /* task run interval(seconds) */
+    int intval;
+    /* time of last run (initialised to the time the task was added) */
+    time_t ltime;
+} TASK;
+
+/* struct for a background task manager: one thread serving up to MAXTASKNUM periodic tasks */
+typedef struct CDBBGTASK
+{
+    TASK tasks[MAXTASKNUM];
+    /* number of tasks (valid entries at the front of tasks[]) */
+    int tnum;
+    /* is running? the thread loop exits once this becomes 0 */
+    int run;
+    pthread_t tid; /* task thread id; 0 until a thread has been created */
+    /* for wait the thread exit */
+    pthread_mutex_t smutex;
+    pthread_cond_t scond; /* signalled by cdb_bgtask_stop() to cut the timed wait short */
+} CDBBGTASK;
+
+
+
+CDBBGTASK *cdb_bgtask_new(); /* allocate an idle manager with an empty task list */
+int cdb_bgtask_add(CDBBGTASK *task, TASKFUNC func, void *arg, int intval); /* 0 on success, -1 if rejected */
+void cdb_bgtask_start(CDBBGTASK *bt); /* start the task thread; no-op when already running */
+void cdb_bgtask_stop(CDBBGTASK *task); /* stop and join the thread, then clear the task list */
+void cdb_bgtask_destroy(CDBBGTASK *task); /* stop, destroy the sync objects and free the manager */
+
+
+#endif
diff --git a/libdap-cuttdb/src/cdb_bloomfilter.c b/libdap-cuttdb/src/cdb_bloomfilter.c
new file mode 100644
index 0000000000000000000000000000000000000000..ebf5e2d3e8b508a20985b76bd6a1e974a89c9daf
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_bloomfilter.c
@@ -0,0 +1,158 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#include "cdb_bloomfilter.h"
+#include <stdlib.h>
+#include <string.h>
+
+#define CDBBFHASHNUM 16 /* maximum number of hash functions (size of BFSEEDS) */
+#define CDBBFSPLITPOW 6 /* the bitmap is split into 1 << CDBBFSPLITPOW separately allocated parts */
+/* one multiplier per hash function, used by the rolling hash in cdb_bf_set()/cdb_bf_exist() */
+static uint64_t BFSEEDS[CDBBFHASHNUM] = {217636919,290182597,386910137,515880193,
+                                        687840301,917120411,1222827239,1610612741,
+                                        3300450239,3300450259,3300450281,3300450289,
+                                        3221225473ul,4294967291ul,163227661,122420729,};
+
+struct CDBBLOOMFILTER
+{
+    uint8_t *bitmap[1<<CDBBFSPLITPOW]; /* sub-bitmaps, each size >> CDBBFSPLITPOW bytes */
+    uint64_t rnum; /* number of cdb_bf_set() calls since creation or the last cdb_bf_clean() */
+    uint64_t size; /* total bitmap size in bytes, as passed to cdb_bf_new() */
+    int hnum; /* number of hash functions in use */
+    int ratio; /* NOTE(review): never assigned or read in this file -- confirm whether it is used */
+};
+
+
+CDBBLOOMFILTER *cdb_bf_new(uint64_t rnum, uint64_t size)
+{
+    CDBBLOOMFILTER *bf = (CDBBLOOMFILTER *)malloc(sizeof(CDBBLOOMFILTER));
+    /* number of hash should be 0.7 * ratio; rnum == 0 would divide by zero, so it selects the maximum */
+    uint64_t hnum = rnum ? size * 8 * 7 / (rnum * 10) : CDBBFHASHNUM;
+    if (bf == NULL) return NULL; /* out of memory */
+    bf->rnum = 0;
+    bf->size = size;
+    /* number of hash is limit in [1, 16]; clamp in 64 bits before narrowing to int */
+    if (hnum > CDBBFHASHNUM)
+        hnum = CDBBFHASHNUM;
+    bf->hnum = hnum ? (int)hnum : 1;
+    /* avoid malloc too much memory once */
+    for(int i = 0; i < (1 << CDBBFSPLITPOW); i++) {
+        bf->bitmap[i] = (uint8_t*)malloc(size >> CDBBFSPLITPOW);
+        memset(bf->bitmap[i], 0, size >> CDBBFSPLITPOW);
+    }
+    return bf;
+}
+
+
+void cdb_bf_set(CDBBLOOMFILTER *bf, void *key, int ksize)
+{
+    uint8_t *src = (uint8_t *)key, *end = src + ksize;
+    uint64_t hval[CDBBFHASHNUM] = {0};
+    /* one rolling hash per active seed: h = h * seed + byte, over all ksize key bytes */
+    for(;src < end; src++) 
+        for(int i = 0; i < bf->hnum; i++) 
+            hval[i] = hval[i] * BFSEEDS[i] + *src;
+    /* low CDBBFSPLITPOW bits pick the sub-bitmap, the remaining bits pick the bit inside it */
+    for(int i = 0; i < bf->hnum; i++) {
+        uint64_t p = (hval[i] >> CDBBFSPLITPOW) % ((bf->size >> CDBBFSPLITPOW) << 3);
+        uint8_t *bitmap = bf->bitmap[hval[i] & ((1<<CDBBFSPLITPOW) - 1)];
+        bitmap[p >> 3] |= (1 << (p & 0x07));
+    }
+
+    bf->rnum++; /* counts calls, not distinct keys */
+}
+
+
+bool cdb_bf_exist(CDBBLOOMFILTER *bf, void *key, int ksize)
+{
+    uint8_t *src = (uint8_t *)key, *end = src + ksize;
+    uint64_t hval[CDBBFHASHNUM] = {0};
+    int exist = 0;
+    /* same rolling hashes as cdb_bf_set(), so a key that was set probes the same bits */
+    for(;src < end; src++) 
+        for(int i = 0; i < bf->hnum; i++) 
+            hval[i] = hval[i] * BFSEEDS[i] + *src;
+    /* count the probed bits that are set, stopping at the first clear one */
+    for(int i = 0; i < bf->hnum; i++) {
+        uint64_t p = (hval[i] >> CDBBFSPLITPOW) % ((bf->size >> CDBBFSPLITPOW) << 3);
+        uint8_t *bitmap = bf->bitmap[hval[i] & ((1<<CDBBFSPLITPOW) - 1)];
+        if (bitmap[p >> 3] & (1 << (p & 0x07)))
+            exist++;
+        else 
+            break;
+    }
+    /* true only when all hnum probed bits were set */
+    return (exist == bf->hnum);
+}
+
+void cdb_bf_clean(CDBBLOOMFILTER *bf)
+{
+    for(int i = 0; i < (1 << CDBBFSPLITPOW); i++) 
+        memset(bf->bitmap[i], 0, bf->size >> CDBBFSPLITPOW);
+    /* all bits cleared: the filter is empty again */
+    bf->rnum = 0;
+}
+
+
+void cdb_bf_destroy(CDBBLOOMFILTER *bf)
+{
+    for(int i = 0; i < (1 << CDBBFSPLITPOW); i++) 
+        free(bf->bitmap[i]); /* every sub-bitmap was allocated separately in cdb_bf_new() */
+    free(bf);
+}
+
+
+#ifdef _UT_CDBBF_
+#include <stdio.h>
+#include <stdlib.h>
+#include "cdb_bloomfilter.h"
+
+int main(int argc, char *argv[])
+{
+    int size = 1048576;
+    int rnum = 1048576;
+    if (argc > 1)
+        rnum = atoi(argv[1]);
+    if (argc > 2)
+        size = atoi(argv[2]);
+    /* insert the even numbers 0, 2, ..., 2 * (rnum - 1) as 4-byte keys */
+    CDBBLOOMFILTER *bf = cdb_bf_new(rnum, size);
+    for(int i = 0; i < rnum; i++) {
+        int j = 2 * i;
+        cdb_bf_set(bf, &j, 4);
+    }
+    /* every inserted key must be reported as present */
+    int exist = 0;
+    for(int i = 0; i < rnum; i++) {
+        int j = 2 * i;
+        if (cdb_bf_exist(bf, &j, 4))
+            exist++;
+    }
+    printf("right positive: %.2f%%%%\n", (float)exist/(float)rnum*10000);
+
+    exist = 0;
+    for(int i = 0; i < rnum * 2; i++) {
+        int j = 2 * i + 1; /* odd keys were never inserted, so any hit is a false positive */
+        if (cdb_bf_exist(bf, &j, 4))
+            exist++;
+    }
+
+    printf("false positive: %.2f%%%%  %d/%d\n", (float)exist/(float)rnum*5000, exist, rnum * 2);
+    printf("element num: %llu\n", (unsigned long long)bf->rnum); /* rnum is uint64_t, not int */
+    cdb_bf_destroy(bf);
+    return 0;
+}
+#endif
+
diff --git a/libdap-cuttdb/src/cdb_bloomfilter.h b/libdap-cuttdb/src/cdb_bloomfilter.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ccdab1fccc1d92d843f072046550741a5cdaf37
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_bloomfilter.h
@@ -0,0 +1,34 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+/*
+Bloom Filter is currently not used in cuttdb
+*/
+#ifndef _CDB_BLOOMFILTER_H_
+#define _CDB_BLOOMFILTER_H_
+#include <stdbool.h>
+#include <stdint.h>
+
+typedef struct CDBBLOOMFILTER CDBBLOOMFILTER; /* opaque: the layout is private to cdb_bloomfilter.c */
+
+#define CDBBFRATIO 8
+
+CDBBLOOMFILTER *cdb_bf_new(uint64_t rnum, uint64_t size); /* rnum: expected keys, size: bitmap bytes */
+void cdb_bf_set(CDBBLOOMFILTER *bf, void *key, int ksize); /* add the ksize-byte key */
+bool cdb_bf_exist(CDBBLOOMFILTER *bf, void *key, int ksize); /* true if all probed bits are set */
+void cdb_bf_clean(CDBBLOOMFILTER *bf); /* clear every bit and reset the element count */
+void cdb_bf_destroy(CDBBLOOMFILTER *bf); /* free the bitmaps and the filter itself */
+
+#endif
diff --git a/libdap-cuttdb/src/cdb_builddb.c b/libdap-cuttdb/src/cdb_builddb.c
new file mode 100644
index 0000000000000000000000000000000000000000..fc5f18dc68c9dbd78de89441d88ada791a8b97a5
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_builddb.c
@@ -0,0 +1,72 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+#include "cuttdb.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <time.h>
+
+int main(int argc, char *argv[])
+{
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s db_path [hsize = 2000000]\n", argv[0]);
+        return 0;
+    }
+    CDB *db = cdb_new(); /* created only after the arguments are validated */
+    /* 1TB memory limit(unlimited) */
+    cdb_option(db, argc >= 3? atoi(argv[2]):2000000 , 0, 1048576);
+    cdb_seterrcb(db, cdb_deferrorcb, NULL);
+    if (cdb_open(db, argv[1], CDB_CREAT | CDB_PAGEWARMUP) < 0) {
+        return -1;
+    }
+    char *buf = NULL;
+    size_t cap = 0;   /* capacity of buf, managed by getline() */
+    ssize_t size;     /* getline() returns ssize_t: -1 on EOF or error */
+    long count = 0;
+    /* each input line: key <TAB> value <TAB> absolute expire time in decimal (0 = never) */
+    while((size = getline(&buf, &cap, stdin)) != -1) {
+        /* remove the delimiter; the last line may come without a trailing newline */
+        if (size > 0 && buf[size - 1] == '\n')
+            buf[--size] = '\0';
+        int klen = -1;
+        int vlen = -1;
+        uint32_t expire = 0;
+        int parsenum = 0;
+        for(ssize_t i = 0; i < size; i++) {
+            if (buf[i] == '\t') {
+                if (klen == -1)
+                    klen = (int)i;
+                else {
+                    vlen = (int)(i - klen - 1);
+                    parsenum = 1;
+                }
+            } else if (buf[i] >= '0' && buf[i] <= '9' && parsenum) {
+                expire = expire * 10 + buf[i] - '0';
+            }
+        }
+
+        if (klen > 0 && vlen > 0) {
+            cdb_set2(db, buf, klen, buf + klen + 1, vlen,
+                    CDB_OVERWRITE, expire > 0? expire - time(NULL): 0);
+            count++;
+        }
+    }
+    free(buf); /* getline() reuses and grows one buffer; release it once, also after EOF */
+    cdb_destroy(db);
+    fprintf(stderr, "imported %ld records\n", count);
+    return 0;
+}
+
+
diff --git a/libdap-cuttdb/src/cdb_core.c b/libdap-cuttdb/src/cdb_core.c
new file mode 100644
index 0000000000000000000000000000000000000000..79356a0f2b25659da3e71e12e2e670af99b5d075
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_core.c
@@ -0,0 +1,1396 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#include "cuttdb.h"
+#include "cdb_crc64.h"
+#include "cdb_types.h"
+#include "cdb_hashtable.h"
+#include "cdb_bloomfilter.h"
+#include "cdb_lock.h"
+#include "cdb_bgtask.h"
+#include "cdb_errno.h"
+#include "cdb_vio.h"
+#include "cdb_core.h"
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <pthread.h>
+
+static void _cdb_pageout(CDB *db);
+static void _cdb_defparam(CDB *db);
+static void _cdb_recout(CDB *db);
+static uint32_t _pagehash(const void *key, int len);
+static void _cdb_flushdpagetask(void *arg);
+static void _cdb_timerreset(struct timespec *ts);
+static uint32_t _cdb_timermicrosec(struct timespec *ts);
+static void _cdb_pagewarmup(CDB *db, bool loadbf);
+
+
+/* it isn't necessary to rehash bid in hash table cache: the first 4 bytes of the key are the hash */
+static uint32_t _pagehash(const void *key, int len)
+{
+    return *(uint32_t*)key; /* len is ignored; key must point to at least 4 readable bytes */
+}
+
+
+/* used to get the duration of a procedure: stores the current CLOCK_MONOTONIC time in *ts */
+static void _cdb_timerreset(struct timespec *ts)
+{
+    clock_gettime(CLOCK_MONOTONIC, ts); /* pair with _cdb_timermicrosec() to read the elapsed time */
+}
+
+
+static uint32_t _cdb_timermicrosec(struct timespec *ts)
+{
+    struct timespec ts2; /* "now"; *ts is the start time stored by _cdb_timerreset() */
+    uint32_t diff;       /* elapsed microseconds, truncated to 32 bits */
+    clock_gettime(CLOCK_MONOTONIC, &ts2);
+    diff = (ts2.tv_sec - ts->tv_sec) * 1000000;
+    diff += ts2.tv_nsec / 1000;
+    diff -= ts->tv_nsec / 1000;
+    return diff;
+}
+
+
+/* reset the parameters of db to their defaults (does not release anything the fields point to) */
+static void _cdb_defparam(CDB *db)
+{
+    db->rnum = 0;
+    db->bfsize = 0;
+    db->rclimit = 128 * MB;  /* default record cache limit */
+    db->pclimit = 1024 * MB; /* default page cache limit */
+    db->hsize = 1000000; 
+    db->rcache = db->pcache = db->dpcache = NULL;
+    db->bf = NULL;
+    db->opened = false;
+    db->vio = NULL;
+    db->mtable = NULL;
+    db->oid = 0;
+    db->roid = 0;
+    db->errcbarg = NULL;
+    db->errcb = NULL;
+    db->areadsize = 4 * KB;
+    return;
+}
+
+
+/* flush all dirty pages; NOTE(review): takes none of the db locks -- confirm callers exclude other users */
+void cdb_flushalldpage(CDB *db)
+{
+    if (db->dpcache) {
+        while (db->dpcache->num) {
+            CDBHTITEM *item = cdb_ht_poptail(db->dpcache);    
+            uint32_t bid = *(uint32_t*)cdb_ht_itemkey(db->dpcache, item);
+            FOFF off;
+            db->vio->wpage(db->vio, (CDBPAGE*)cdb_ht_itemval(db->dpcache, item), &off);
+            db->mtable[bid] = off; /* the main table now points at the freshly written page */
+            free(item);
+        } 
+        /* dpcache is empty now: record the current operation id as the recovery point */
+        db->roid = db->oid; 
+        db->vio->cleanpoint(db->vio);
+    }
+}
+
+
+/* flush oldest dirty index page to disk, it runs in another thread and triggered by timer */
+static void _cdb_flushdpagetask(void *arg)
+{
+    CDB *db = (CDB *)arg;
+    CDBHTITEM *item;
+    CDBPAGE *page;
+    time_t now = time(NULL);
+    bool cleandcache = false;
+    uint32_t bid;
+
+    if (!db->dpcache)
+        /* no dirty page cache */
+        return;
+
+    /* if there isn't too much dirty page and some time passed since last clean,
+     write out all dirty pages to make a recovery point(oid) */
+    if (db->dpcache->num < 1024 && now > db->ndpltime + 120)
+        cleandcache = true;
+    /* drain from the tail until the cache is empty or the tail page is not yet due */
+    while(db->dpcache->num) {
+        FOFF off;
+        cdb_lock_lock(db->dpclock);
+        item = cdb_ht_gettail(db->dpcache);
+        /* no item in dpcache after lock */
+        if (item == NULL) {
+            cdb_lock_unlock(db->dpclock);
+            return;
+        }
+        page = (CDBPAGE *)cdb_ht_itemval(db->dpcache, item);
+        /* bid = page->bid; also OK */
+        bid = *(uint32_t*)cdb_ht_itemkey(db->dpcache, item);
+        /* been dirty for too long? */
+        if (now > page->mtime + DPAGETIMEOUT || cleandcache) {
+            if (cdb_lock_trylock(db->mlock[page->bid % MLOCKNUM])) {
+                /* avoid dead lock, since dpclock is holding */
+                cdb_lock_unlock(db->dpclock);
+                return;
+            }
+            /* remove it from dpcache */
+            cdb_ht_poptail(db->dpcache);
+            cdb_lock_unlock(db->dpclock);
+
+            /* write to disk */
+            struct timespec ts;
+            _cdb_timerreset(&ts);
+            db->vio->wpage(db->vio, page, &off);
+            db->wcount++;
+            db->wtime += _cdb_timermicrosec(&ts);
+            db->mtable[bid] = off;
+
+            /* move the clean page into pcache */
+            cdb_lock_lock(db->pclock);
+            cdb_ht_insert(db->pcache, item);
+            cdb_lock_unlock(db->pclock);
+            cdb_lock_unlock(db->mlock[bid % MLOCKNUM]);
+        } else {
+            /* tail in dpcache isn't expired */
+            cdb_lock_unlock(db->dpclock);
+            return;
+        }
+    }
+    /* reached only when the loop ended because dpcache ran empty (all other exits return) */
+    if (db->dpcache->num == 0 && cleandcache)
+        db->ndpltime = now;
+
+    if (cleandcache) {
+        /* clean succeed if goes here, remember the recovery point */
+        /* it's not necessary to lock */
+        db->roid = db->oid; 
+        db->vio->cleanpoint(db->vio);
+    }
+}
+
+
+/* fill the index page cache, and set the bloomfilter if necessary */
+static void _cdb_pagewarmup(CDB *db, bool loadbf)
+{
+    char sbuf[SBUFSIZE];
+    void *it = db->vio->pageitfirst(db->vio, 0);
+    /* no page iterator available: nothing to warm up */
+    if (it == NULL)
+        return;
+
+    for(;;) {
+        CDBPAGE *page = (CDBPAGE *)sbuf;
+        if (db->vio->pageitnext(db->vio, &page, it) < 0)
+            break;
+
+        /* the page is the newest one because its offset matches the one in main table */
+        if (OFFEQ(page->ooff, db->mtable[page->bid])) {
+            if (loadbf) {
+                /* iterate key hashes in page, set to the filter */
+                cdb_lock_lock(db->bflock);
+                for(uint32_t i = 0; i < page->num; i++) {
+                    uint64_t hash = (page->bid << 24) | (page->items[i].hash.i2 << 8)
+                        | (page->items[i].hash.i1);
+                    /* bloom filter use the combined record hash as key */
+                    cdb_bf_set(db->bf, &hash, SI8);
+                }
+                cdb_lock_unlock(db->bflock);
+            }
+            /* NOTE(review): if page->bid is 32 bits wide, "<< 24" above drops its top 8 bits before widening */
+            /* set the page to pcache if it doesn't exceed the limit size */
+            if (db->pcache && db->pcache->size < db->pclimit) {
+                cdb_lock_lock(db->pclock);
+                cdb_ht_insert2(db->pcache, &page->bid, SI4, page, MPAGESIZE(page));
+                cdb_lock_unlock(db->pclock);
+            }
+        }
+        /* the page may not be still in stack */
+        if (page != (CDBPAGE *)sbuf)
+            free(page);
+        /* without a bloom filter to fill there is no reason to keep reading once pcache is full */
+        if (!loadbf && (db->pcache && db->pcache->size > db->pclimit))
+            break;
+    }
+
+    db->vio->pageitdestroy(db->vio, it);
+}
+
+
+/* generate an incremental global operation id: returns the current db->oid, then advances it */
+uint64_t cdb_genoid(CDB *db)
+{
+    uint64_t oid;
+    cdb_lock_lock(db->oidlock); /* the read and the increment happen under oidlock */
+    oid = db->oid++;
+    cdb_lock_unlock(db->oidlock);
+    return oid;
+}
+
+
+/* get a new record iterator; oid is handed to the storage layer's recitfirst() */
+void *cdb_iterate_new(CDB *db, uint64_t oid)
+{
+    return db->vio->recitfirst(db->vio, oid); /* cdb_iterate() treats a NULL result as "nothing to do" */
+}
+
+
+
+/* iterate the database by callback; returns the number of records handed to itcb */
+uint64_t cdb_iterate(CDB *db, CDB_ITERCALLBACK itcb, void *arg, void *iter)
+{
+    char sbuf[SBUFSIZE];
+    uint64_t cnt = 0;
+    /* a NULL iterator yields no records */
+    if (iter == NULL)
+        return cnt;
+    for(;;) {
+        /* the rec is a copy from file, may in stack or allocated in heap */
+        CDBREC *rec = (CDBREC *)sbuf;
+        bool ret = true;
+        if (db->vio->recitnext(db->vio, &rec, iter) < 0) 
+            break;
+        /* records that fail cdb_checkoff() are skipped without calling itcb */
+        if (cdb_checkoff(db, CDBHASH64(rec->key, rec->ksize), rec->ooff, CDB_NOTLOCKED)) {
+            ret = itcb(arg, rec->key, rec->ksize, rec->val, rec->vsize, rec->expire, rec->oid);
+            cnt++;
+        }
+        if (rec != (CDBREC *)sbuf)
+            free(rec);
+        if (!ret) /* the callback returned false: stop iterating */
+            break;
+    }
+    return cnt;
+}
+
+
+
+/* destroy the iterator obtained from cdb_iterate_new() */
+void cdb_iterate_destroy(CDB *db, void *iter)
+{
+    db->vio->recitdestroy(db->vio, iter); /* NOTE(review): iter is not NULL-checked here -- confirm recitdestroy accepts NULL */
+}
+
+
+/* difficult to implement */
+/*
+static void _cdb_rcachewarmup(CDB *db)
+{
+}
+*/
+
+
+CDB *cdb_new()
+{
+    CDB *db;
+    db = (CDB *)malloc(sizeof(CDB));
+    /* I assume all operation in this layer is 'fast', so no mutex used here */
+    for(int i = 0; i < MLOCKNUM; i++) 
+        db->mlock[i] = cdb_lock_new(CDB_LOCKSPIN);
+    db->dpclock = cdb_lock_new(CDB_LOCKSPIN);
+    db->pclock = cdb_lock_new(CDB_LOCKSPIN);
+    db->rclock = cdb_lock_new(CDB_LOCKSPIN);
+    db->stlock = cdb_lock_new(CDB_LOCKSPIN);
+    db->oidlock = cdb_lock_new(CDB_LOCKSPIN);
+    db->bflock = cdb_lock_new(CDB_LOCKSPIN);
+    db->bgtask = cdb_bgtask_new();
+    /* every thread should has its own errno */
+    db->errkey = (pthread_key_t *)malloc(sizeof(pthread_key_t));
+    pthread_key_create(db->errkey, NULL);
+    /* set default parameter */
+    _cdb_defparam(db);
+    return db;
+}
+
+
+/* Configure the main table size and the two cache limits.
+ *   bnum      requested number of hash buckets; values of 4096 or less are
+ *             raised to 4096
+ *   rcacheMB  record cache limit in MB; a negative value keeps the current one
+ *   pcacheMB  page cache limit in MB; a negative value keeps the current one
+ * Always returns 0. */
+int cdb_option(CDB *db, int bnum, int rcacheMB, int pcacheMB)
+{
+    /* too small bnum is not allowed */
+    if (bnum > 4096)
+        db->hsize = bnum;
+    else
+        db->hsize = 4096;
+
+    if (!(rcacheMB < 0))
+        db->rclimit = (uint64_t)rcacheMB * MB;
+
+    if (!(pcacheMB < 0))
+        db->pclimit = (uint64_t)pcacheMB * MB;
+
+    return 0;
+}
+
+
+/* Record the requested bloom filter size; cdb_open creates a filter only
+ * when this value is non-zero. */
+void cdb_option_bloomfilter(CDB *db, uint64_t size)
+{
+    uint64_t requested = size;
+
+    db->bfsize = requested;
+}
+
+/* Set the read-ahead size, clamped to the range
+ * [1 KB, SBUFSIZE - (sizeof(CDBREC) - RECHSIZE)]. */
+void cdb_option_areadsize(CDB *db, uint32_t size)
+{
+    uint32_t clamped = size;
+
+    if (clamped < 1 * KB)
+        clamped = 1 * KB;
+
+    if (clamped > SBUFSIZE - (sizeof(CDBREC) - RECHSIZE))
+        clamped = SBUFSIZE - (sizeof(CDBREC) - RECHSIZE);
+
+    db->areadsize = clamped;
+}
+
+/* Open the database stored in file_name, or a pure in-memory table when
+ * file_name equals CDB_MEMDB.
+ * Creates the caches and bloom filter enabled by the configured limits, opens
+ * the storage backend, registers the dirty-page flush task and starts the
+ * background task thread.  `mode` is forwarded to the backend's open hook and
+ * is also tested for CDB_PAGEWARMUP.
+ * Returns 0 on success and -1 on failure. */
+int cdb_open(CDB *db, const char *file_name, int mode)
+{
+    /* it will become a hash table when file_name == CDB_MEMDB */
+    int memdb = (strcmp(file_name, CDB_MEMDB) == 0);
+
+    if (db->rclimit)
+        /* record cache is enabled */
+        db->rcache = cdb_ht_new(true, NULL);
+    else if (memdb) {
+        /* record cache is disabled, but in MEMDB mode */
+        cdb_seterrno(db, CDB_MEMDBNOCACHE, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    if (db->pclimit && !memdb) {
+        /* page cache enabled. page cache is meaningless under MEMDB  mode */
+        db->dpcache = cdb_ht_new(true, _pagehash);
+        db->pcache = cdb_ht_new(true, _pagehash);
+    }
+
+
+    if (!memdb) {
+        if (db->bfsize) {
+            /* bloom filter enabled */
+            db->bf = cdb_bf_new(db->bfsize, db->bfsize);
+        }
+        /* now only one storage format is supported */
+        db->vio = cdb_vio_new(CDBVIOAPND2);
+        db->vio->db = db;
+        if (db->vio->open(db->vio, file_name, mode) < 0)
+            goto ERRRET;
+        /* when the header cannot be read, start from an all-zero main table */
+        if (db->vio->rhead(db->vio) < 0) {
+            db->mtable = (FOFF*)malloc(sizeof(FOFF) * db->hsize);
+            memset(db->mtable, 0, sizeof(FOFF) * db->hsize);
+        }
+        /* dirty index page would be swap to disk by timer control */
+        cdb_bgtask_add(db->bgtask, _cdb_flushdpagetask, db, 1);
+        db->ndpltime = time(NULL);
+        /* start background task thread */
+        cdb_bgtask_start(db->bgtask);
+    } else {
+        /* no persistent storage under MEMDB mode */
+        db->vio = NULL;
+        /* NOTE(review): the bgtask object created in cdb_new is dropped here
+         * without cdb_bgtask_destroy -- confirm this is not a leak */
+        db->bgtask = NULL;
+        db->mtable = NULL;
+    }
+
+    if (db->bf || ((mode & CDB_PAGEWARMUP) && db->pcache)) {
+        /* fill the bloom filter if it is enabled, and fill the page cache */
+        _cdb_pagewarmup(db, !!db->bf);
+    }
+
+    /* reset the statistic info */
+    cdb_stat(db, NULL);
+    db->opened = true;
+    return 0;
+
+ERRRET:
+    /* NOTE(review): db->vio is not destroyed on this path, so it appears to
+     * leak when the backend's open hook fails -- confirm against cdb_vio */
+    if (db->rcache)
+        cdb_ht_destroy(db->rcache);
+    if (db->pcache)
+        cdb_ht_destroy(db->pcache);
+    if (db->dpcache)
+        cdb_ht_destroy(db->dpcache);
+    if (db->bf)
+        cdb_bf_destroy(db->bf);
+    cdb_bgtask_stop(db->bgtask);
+    /* presumably resets the pointers destroyed above -- TODO confirm */
+    _cdb_defparam(db);
+    return -1;
+}
+
+
+/* Shrink the page caches while PCOVERFLOW(db) holds.
+ * Clean pages are dropped first; once the clean cache is empty the oldest
+ * dirty page is popped, written through the storage backend and its new
+ * offset stored in the main table.  Gives up for this round when the dirty
+ * cache has no tail item or the bucket's main-table lock cannot be taken
+ * without blocking. */
+static void _cdb_pageout(CDB *db)
+{
+    while (PCOVERFLOW(db)) {
+        if (db->pcache->num) {
+            /* clean page cache is prior */
+            cdb_lock_lock(db->pclock);
+            cdb_ht_removetail(db->pcache);
+            cdb_lock_unlock(db->pclock);
+        } else if (db->dpcache->num) {
+            CDBHTITEM *item;
+            uint32_t bid;
+            FOFF off;
+            cdb_lock_lock(db->dpclock);
+            item = cdb_ht_gettail(db->dpcache);    
+            if (item == NULL) {
+                cdb_lock_unlock(db->dpclock);
+                break;
+            }
+
+            /* the item key is the bucket id of the page */
+            bid = *(uint32_t*)cdb_ht_itemkey(db->dpcache, item);
+            /* must lock the main table inside the dpclock protection */
+            if (cdb_lock_trylock(db->mlock[bid % MLOCKNUM]) < 0) {
+                /* avoid dead lock since dpclock is holding */
+                cdb_lock_unlock(db->dpclock);
+                /* do nothing this time */
+                break;
+            }
+            /* detach the item; from here on this function owns it */
+            cdb_ht_poptail(db->dpcache);
+            cdb_lock_unlock(db->dpclock);
+
+            /* write out dirty page */
+            struct timespec ts;
+            _cdb_timerreset(&ts);
+            db->vio->wpage(db->vio, (CDBPAGE*)cdb_ht_itemval(db->dpcache, item), &off);
+            db->wcount++;
+            db->wtime += _cdb_timermicrosec(&ts);
+            /* the main table now points at the freshly written page */
+            db->mtable[bid] = off;
+            cdb_lock_unlock(db->mlock[bid % MLOCKNUM]);
+            free(item);
+        }
+        /* NOTE(review): if both caches report num == 0 while PCOVERFLOW(db)
+         * is still true this loop would spin -- confirm that cannot happen */
+    }
+}
+
+
+/* Evict records from the tail of the record cache, one per iteration and
+ * under db->rclock, until RCOVERFLOW(db) no longer holds. */
+static void _cdb_recout(CDB *db)
+{
+    for (;;) {
+        if (!RCOVERFLOW(db))
+            return;
+
+        cdb_lock_lock(db->rclock);
+        if (db->rcache->num != 0)
+            cdb_ht_removetail(db->rcache);
+        cdb_lock_unlock(db->rclock);
+    }
+}
+
+
+/* Collect every record offset stored in the index page for `hash`.
+ * At most one of them belongs to the wanted key; the others are hash
+ * collisions.
+ *
+ *   offs    in/out: points at a caller buffer with room for SFOFFNUM entries.
+ *           When more matches are found it is replaced by a heap buffer that
+ *           the caller must free (compare against the original pointer).
+ *   locked  CDB_NOTLOCKED makes this function take and release the bucket's
+ *           main-table lock itself; otherwise the caller already holds it.
+ *
+ * Returns the number of offsets stored in *offs, 0 when the bloom filter
+ * rules the key out, or -1 when the page cannot be read. */
+int cdb_getoff(CDB *db, uint64_t hash, FOFF **offs, int locked) 
+{
+    char sbuf[SBUFSIZE];
+    CDBPAGE *page = NULL;
+    int rnum;
+    bool incache = true;
+    uint32_t bid = (hash >> 24) % db->hsize;
+    PHASH phash;
+
+    phash.i1 = hash & 0xff;
+    phash.i2 = (hash >> 8) & 0xffff;
+
+    if (db->bf) {
+        /* widen bid before shifting: a 32-bit shift would drop the high bits
+         * of bid and produce a key that differs from the one cdb_updatepage
+         * stores in the filter */
+        uint64_t bfkey = ((uint64_t)bid << 24) | (hash & 0xffffff);
+        /* check the key-hash in bloom filter? return now if not exist */
+        cdb_lock_lock(db->bflock);
+        if (!cdb_bf_exist(db->bf, &bfkey, SI8)) {
+            cdb_lock_unlock(db->bflock);
+            return 0;
+        }
+        cdb_lock_unlock(db->bflock);
+    }
+
+    if (locked == CDB_NOTLOCKED) cdb_lock_lock(db->mlock[bid % MLOCKNUM]);
+    /* page exists in clean page cache? */
+    if (db->pcache) {
+        cdb_lock_lock(db->pclock);
+        page = cdb_ht_get2(db->pcache, &bid, SI4, true);
+        cdb_lock_unlock(db->pclock);
+    }
+
+    /* not in pcache, exists in dirty page cache? */
+    if (page == NULL && db->dpcache) {
+        cdb_lock_lock(db->dpclock);
+        page = cdb_ht_get2(db->dpcache, &bid, SI4, true);
+        cdb_lock_unlock(db->dpclock);
+    }
+
+    if (page == NULL) {
+        /* not in dpcache either, read from disk */
+        incache = false;
+        db->pcmiss++;
+        /* page stays in stack by default */
+        page = (CDBPAGE *)sbuf;
+        if (OFFNOTNULL(db->mtable[bid])) {
+            /* page offset not null in main table */
+            int ret;
+            struct timespec ts;
+            _cdb_timerreset(&ts);
+            ret = db->vio->rpage(db->vio, &page, db->mtable[bid]);
+            db->rcount++;
+            db->rtime += _cdb_timermicrosec(&ts);
+
+            /* read page error, return */
+            if (ret < 0) {
+                if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]);
+                if (page != (CDBPAGE *)sbuf)
+                    free(page);
+                return -1;
+            }
+        } else {
+            /* no page in this bucket */
+            page->cap = page->num = 0;
+            page->osize = 0;
+            OFFZERO(page->ooff);
+        }
+    } else {
+        db->pchit++;
+    }
+
+    rnum = 0;
+    for(uint32_t i = 0; i < page->num; i++) {
+        /* compare every hash in the page */
+        if (PHASHEQ(page->items[i].hash, phash)) {
+            (*offs)[rnum] = page->items[i].off;
+            /* result offset list stays in stack by default. Allocate one in heap if 
+            it exceeds the limit */
+            if (++rnum == SFOFFNUM) {
+                /* very little possibility goes here; the new buffer is sized
+                 * for the matches so far plus every remaining page item */
+                FOFF *tmp = (FOFF*)malloc((page->num - i + SFOFFNUM + 1) * sizeof(FOFF));
+                memcpy(tmp, *offs, SFOFFNUM * sizeof(FOFF));
+                *offs = tmp;
+            } 
+        }
+    }
+
+    if (!incache) {
+        /* set into clean page cache if not exists before */
+        if (db->pcache) {
+            cdb_lock_lock(db->pclock);
+            cdb_ht_insert2(db->pcache, &bid, SI4, page, MPAGESIZE(page));
+            cdb_lock_unlock(db->pclock);
+        }
+        /* if page now points to heap memory, free it */
+        if (page != (CDBPAGE *)sbuf) {
+            free(page);
+        }
+    }
+    if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]);
+
+    /* check page cache overflow */
+    if (PCOVERFLOW(db))
+        _cdb_pageout(db);
+
+    return rnum;
+}
+
+
+/* Replace a specified record's offset in its index page; may be used at disk
+ * space recycling.
+ *   off     the offset currently stored for the record
+ *   noff    the offset to store instead
+ *   locked  CDB_NOTLOCKED makes this function take and release the bucket's
+ *           main-table lock itself; otherwise the caller already holds it
+ * Returns -1 when the page cannot be read, 0 otherwise.
+ * NOTE(review): the original comment promised a negative return when the
+ * pair is not found, but the code returns 0 in that case -- confirm which
+ * behaviour callers expect. */
+int cdb_replaceoff(CDB *db, uint64_t hash, FOFF off, FOFF noff, int locked)
+{
+    char sbuf[SBUFSIZE];
+    CDBPAGE *page = NULL;
+    CDBHTITEM *pitem = NULL;
+    bool indpcache = false;
+    uint32_t bid = (hash >> 24) % db->hsize;
+    PHASH phash;
+    bool found = false;
+
+    phash.i1 = hash & 0xff;
+    phash.i2 = (hash >> 8) & 0xffff;
+
+    if (locked == CDB_NOTLOCKED) cdb_lock_lock(db->mlock[bid % MLOCKNUM]);
+    if (db->pcache) {
+        /* in clean page cache, since it would be modified, it should be deleted from pcache */
+        cdb_lock_lock(db->pclock);
+        pitem = cdb_ht_del(db->pcache, &bid, SI4);
+        cdb_lock_unlock(db->pclock);
+        if (pitem)
+            page = (CDBPAGE *)cdb_ht_itemval(db->pcache, pitem);
+    }
+    if (page == NULL && db->dpcache) {
+        /* not in pcache, but in dirty page cache */
+        cdb_lock_lock(db->dpclock);
+        page = cdb_ht_get2(db->dpcache, &bid, SI4, true);
+        cdb_lock_unlock(db->dpclock);
+        if (page)
+            indpcache = true;
+    }
+    if (page == NULL) {
+        /* not exists either, read from disk */
+        db->pcmiss++;
+        /* the page lives in the stack buffer unless rpage swaps in a heap one */
+        page = (CDBPAGE *)sbuf;
+        if (OFFNOTNULL(db->mtable[bid])) {
+            int ret;
+            struct timespec ts;
+            _cdb_timerreset(&ts);
+            ret = db->vio->rpage(db->vio, &page, db->mtable[bid]);
+            db->rcount++;
+            db->rtime += _cdb_timermicrosec(&ts);
+            
+            if (ret < 0) {
+                if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]);
+                if (page != (CDBPAGE *)sbuf)
+                    free(page);
+                return -1;
+            }
+        } else {
+            /* nullified the empty page */
+            page->cap = page->num = 0;
+            page->osize = 0;
+            OFFZERO(page->ooff);
+        }
+    } else {
+        db->pchit++;
+    }
+
+    /* check and modify */
+    for(uint32_t i = 0; i < page->num; i++) {
+        if (PHASHEQ(page->items[i].hash, phash)
+            && OFFEQ(page->items[i].off, off)) {
+                page->items[i].off = noff;
+                found = true;
+                break;
+        }
+    }
+
+    if (db->dpcache && !indpcache) {
+        /* if page already dirty in cache, need not do anything */
+        /* dirty page cache is enabled but not exists before */
+        if (pitem) {
+            /* pitem not NULL indicates it belongs to pcache */
+            if (found) {
+                /* modified page */
+                cdb_lock_lock(db->dpclock);
+                cdb_ht_insert(db->dpcache, pitem);
+                cdb_lock_unlock(db->dpclock);
+            } else {
+                /* got from pcache, but not modified */
+                cdb_lock_lock(db->pclock);
+                cdb_ht_insert(db->pcache, pitem);
+                cdb_lock_unlock(db->pclock);
+            }
+            /* page belongs to memory in 'cache', must not free */
+        } else if (page != NULL) {
+            /* page read from disk, but not in cache; it is put into the
+             * dirty cache whether or not the pair was found */
+            cdb_lock_lock(db->dpclock);
+            cdb_ht_insert2(db->dpcache, &bid, SI4, page, MPAGESIZE(page));
+            cdb_lock_unlock(db->dpclock);
+            /* the 'page' won't be use anymore */
+            if (page != (CDBPAGE *)sbuf) 
+                free(page);
+        }
+    } else if (!db->dpcache){
+        /* no page cache. Write out dirty page immediately */
+        FOFF poff;
+        struct timespec ts;
+        _cdb_timerreset(&ts);
+        db->vio->wpage(db->vio, page, &poff);
+        db->wcount++;
+        db->wtime += _cdb_timermicrosec(&ts);
+
+        db->mtable[bid] = poff;
+        if (page != (CDBPAGE *)sbuf) 
+                free(page);
+    }
+    if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]);
+
+    /* check page cache overflow */
+    if (PCOVERFLOW(db))
+        _cdb_pageout(db);
+
+    return 0;
+}
+
+
+/* Insert (CDB_PAGEINSERTOFF) or delete (CDB_PAGEDELETEOFF) a hash/offset pair
+ * in the index page of the bucket that `hash` maps to.
+ *
+ * The page is taken out of the clean or dirty page cache when present,
+ * otherwise read from disk.  On insertion into a full page a larger page is
+ * built first.  A changed page goes to the dirty page cache, or is written
+ * out immediately when no dirty cache exists.
+ *
+ *   locked  CDB_NOTLOCKED makes this function take and release the bucket's
+ *           main-table lock itself; otherwise the caller already holds it.
+ *
+ * Returns 0 when the page was changed; -1 when nothing changed (pair already
+ * present on insert, pair missing on delete) or the page cannot be read. */
+int cdb_updatepage(CDB *db, uint64_t hash, FOFF off, int opt, int locked)
+{
+    char sbuf[SBUFSIZE], sbuf2[SBUFSIZE];
+    CDBPAGE *page = NULL, *npage = NULL;
+    CDBHTITEM *pitem = NULL, *nitem = NULL;
+    CDBHASHTABLE *tmpcache = NULL;
+    CDBLOCK *tmpclock = NULL;
+    uint32_t bid = (hash >> 24) % db->hsize;
+    PHASH phash;
+
+    phash.i1 = hash & 0xff;
+    phash.i2 = (hash >> 8) & 0xffff;
+
+    if (locked == CDB_NOTLOCKED) cdb_lock_lock(db->mlock[bid % MLOCKNUM]);
+    /* firstly, try move the page out of the cache if possible, 
+    it assumes that the page would be modified(pair exists) */
+    if (db->pcache) {
+        /* try clean page cache */
+        cdb_lock_lock(db->pclock);
+        pitem = cdb_ht_del(db->pcache, &bid, SI4);
+        cdb_lock_unlock(db->pclock);
+        if (pitem) {
+            page = (CDBPAGE *)cdb_ht_itemval(db->pcache, pitem);
+            tmpcache = db->pcache;
+            tmpclock = db->pclock;
+        }
+    }
+    if (page == NULL && db->dpcache) {
+        /* try dirty page cache */
+        cdb_lock_lock(db->dpclock);
+        pitem = cdb_ht_del(db->dpcache, &bid, SI4);
+        cdb_lock_unlock(db->dpclock);
+        if (pitem) {
+            page = (CDBPAGE *)cdb_ht_itemval(db->dpcache, pitem);
+            tmpcache = db->dpcache;
+            tmpclock = db->dpclock;
+        }
+    }
+
+    if (page == NULL) {
+        db->pcmiss++;
+        page = (CDBPAGE *)sbuf;
+        /* doesn't exist in cache, read from disk */
+        if (OFFNOTNULL(db->mtable[bid])) {
+            int ret;
+            struct timespec ts;
+            _cdb_timerreset(&ts);
+            ret = db->vio->rpage(db->vio, &page, db->mtable[bid]);
+            db->rcount++;
+            db->rtime += _cdb_timermicrosec(&ts);
+
+            if (ret < 0) {
+                if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]);
+                if (page != (CDBPAGE *)sbuf)
+                    free(page);
+                return -1;
+            }
+        } else {
+            page->cap = 0;
+            page->num = 0;
+            page->osize = 0;
+            OFFZERO(page->ooff);
+        }
+    } else {
+        db->pchit++;
+    }
+
+    /* do not malloc new page on deletion; only a full page that is about to
+     * receive an insertion is grown */
+    if (opt == CDB_PAGEINSERTOFF && page->cap == page->num) {
+    /* get a new page, from dirty page cache if possible */
+        int npsize = MPAGESIZE(page) + CDB_PAGEINCR * sizeof(PITEM);
+        if (db->dpcache) {
+            nitem = cdb_ht_newitem(db->dpcache, SI4, npsize);
+            *(uint32_t*)cdb_ht_itemkey(db->dpcache, nitem) = bid;
+            npage = (CDBPAGE *)cdb_ht_itemval(db->dpcache, nitem);
+        } else {
+            /* no dpcache, use stack if size fits */
+            if (npsize > SBUFSIZE) 
+                npage = (CDBPAGE *)malloc(npsize);
+            else
+                npage = (CDBPAGE *)sbuf2;
+        }
+
+        /* initialize the new page */
+    
+        npage->bid = bid;
+        npage->oid = cdb_genoid(db);
+        npage->osize = page->osize;
+        npage->ooff = page->ooff;
+        npage->mtime = time(NULL);
+        npage->cap = page->cap + CDB_PAGEINCR;
+        npage->num = page->num;
+        memcpy(npage->items, page->items, page->num * sizeof(PITEM)); 
+        /* old page got from cache */
+        if (pitem)
+            free(pitem);
+        /* old page read from disk, if in stack? */
+        else if (page != (CDBPAGE *)sbuf)
+            free(page);
+
+        page = npage;
+        pitem = nitem;
+    }
+
+    /* remember the item count so a no-op can be detected afterwards */
+    uint32_t onum = page->num;
+
+    if (opt == CDB_PAGEDELETEOFF) {
+        bool found = false;
+        for(uint32_t i = 0; i < page->num; i++) {
+            if (!found) {
+                if (PHASHEQ(page->items[i].hash, phash)
+                    && OFFEQ(page->items[i].off, off))
+                {
+                    found = true;
+                    /* records num is consistent with index */
+                    cdb_lock_lock(db->stlock);
+                    db->rnum--;
+                    cdb_lock_unlock(db->stlock);
+                }
+            }
+            /* once found, shift every following item one slot down */
+            if (found && i + 1 < page->num)
+                page->items[i] = page->items[i+1];
+        }
+        if (found)
+            page->num--;
+    } else if (opt == CDB_PAGEINSERTOFF) {
+        bool found = false;
+        /* check already exist? */
+        for(uint32_t i = 0; i < page->num; i++) {
+            if (PHASHEQ(page->items[i].hash, phash)
+                && OFFEQ(page->items[i].off, off)) {
+                /* avoid exceptional deduplicated item */
+                found = true;
+                break;
+            }
+        }
+
+        /* append to the tail */
+        if (!found) {
+            page->items[page->num].hash = phash;
+            page->items[page->num].off = off;
+            page->num++;
+            /* records num is consistent with index */
+            cdb_lock_lock(db->stlock);
+            db->rnum++;
+            cdb_lock_unlock(db->stlock);
+            if (db->bf) {
+                uint64_t bfkey = (((hash >> 24) % db->hsize) << 24) | (hash & 0xffffff);
+                cdb_lock_lock(db->bflock);
+                cdb_bf_set(db->bf, &bfkey, SI8);
+                cdb_lock_unlock(db->bflock);
+            }
+        }
+    }
+
+    if (page->num == onum) {
+        /* nothing done */
+        if (pitem) {
+            if (tmpcache == NULL) {
+                /* pitem is a freshly grown item whose old page came from
+                 * disk, so no source cache was recorded; such an item is only
+                 * created when the dirty page cache exists, keep it there
+                 * instead of locking a NULL lock */
+                tmpcache = db->dpcache;
+                tmpclock = db->dpclock;
+            }
+            /* insert the item back to the cache where it belongs */
+            cdb_lock_lock(tmpclock);
+            cdb_ht_insert(tmpcache, pitem);
+            cdb_lock_unlock(tmpclock);
+        } else {
+            if (page != (CDBPAGE *)sbuf2
+                    && page != (CDBPAGE *)sbuf)
+                free(page);
+        }
+        if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]);
+        return -1;
+    } else {
+        if (pitem) {
+            /* the page changed: it is dirty now wherever it came from */
+            cdb_lock_lock(db->dpclock);
+            cdb_ht_insert(db->dpcache, pitem);
+            cdb_lock_unlock(db->dpclock);
+        } else {
+            /* no cache item: write the page out now; `off` is reused to
+             * receive the page's new offset */
+            struct timespec ts;
+            _cdb_timerreset(&ts);
+            db->vio->wpage(db->vio, page, &off);
+            db->wcount++;
+            db->wtime += _cdb_timermicrosec(&ts);
+
+            db->mtable[bid] = off;
+            if (page != (CDBPAGE *)sbuf2
+                    && page != (CDBPAGE *)sbuf)
+                free(page);
+        }
+    }
+
+    if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]);
+
+    /* check page cache overflow */
+    if (PCOVERFLOW(db))
+        _cdb_pageout(db);
+
+    return 0;
+}
+
+
+/* Tell whether the index holds an entry for `hash` whose offset equals `off`.
+ * `locked` is forwarded to cdb_getoff.  A failed lookup counts as "no". */
+bool cdb_checkoff(CDB *db, uint64_t hash, FOFF off, int locked)
+{
+    FOFF stackoffs[SFOFFNUM];
+    FOFF *cand = (FOFF *)stackoffs;
+    bool hit = false;
+    int total;
+    int idx = 0;
+
+    /* get all possible offsets */
+    total = cdb_getoff(db, hash, &cand, locked);
+
+    while (!hit && idx < total) {
+        if (OFFEQ(cand[idx], off))
+            hit = true;
+        idx++;
+    }
+
+    /* cdb_getoff may have swapped in a heap buffer */
+    if (cand != (FOFF *)stackoffs)
+        free(cand);
+
+    return hit;
+}
+
+
+/* Convenience form of cdb_set2: store the value with CDB_OVERWRITE and an
+ * expire argument of 0.  Returns whatever cdb_set2 returns. */
+int cdb_set(CDB *db, const char *key, int ksize, const char *val, int vsize)
+{
+    int rc = cdb_set2(db, key, ksize, val, vsize, CDB_OVERWRITE, 0);
+
+    return rc;
+}
+
+
+/* Store a key/value pair.
+ *   opt     bit flags; CDB_INSERTIFNOEXIST, CDB_INSERTIFEXIST and
+ *           CDB_INSERTCACHE are tested here
+ *   expire  lifetime in seconds from now, 0 for a record that never expires
+ * In MEMDB mode (no storage backend) the pair only goes into the record cache.
+ * Returns 0 on success, -1 on an index read or record write failure,
+ * -2 when CDB_INSERTIFNOEXIST is set and a live record exists,
+ * -3 when CDB_INSERTIFEXIST is set and no live record exists. */
+int cdb_set2(CDB *db, const char *key, int ksize, const char *val, int vsize, int opt, int expire)
+{
+    CDBREC rec;
+    FOFF ooff, noff;
+    uint32_t now = time(NULL);
+    uint64_t hash;
+    uint32_t lockid;
+    bool expired = false;
+ 
+    if (db->vio == NULL) {
+        /* if it is a memdb, just operate on the record cache and return */
+        cdb_lock_lock(db->rclock);
+        cdb_ht_insert2(db->rcache, key, ksize, val, vsize);
+        cdb_lock_unlock(db->rclock);
+        if (RCOVERFLOW(db))
+            _cdb_recout(db);
+        return 0;
+    }
+
+    hash = CDBHASH64(key, ksize);
+    lockid = (hash >> 24) % db->hsize % MLOCKNUM;
+    OFFZERO(rec.ooff);
+    OFFZERO(ooff);
+    rec.osize = 0;
+    rec.key = (char*)key;
+    rec.val = (char*)val;
+    rec.ksize = ksize;
+    rec.vsize = vsize;
+    rec.oid = cdb_genoid(db);
+    rec.expire = expire? now + expire : 0;
+        
+    cdb_lock_lock(db->mlock[lockid]);
+    if (db->rcache) {
+        /* if record already exists, get its old meta info */
+        int item_vsize;
+        char *cval;
+        uint32_t old_expire = 0;
+        cdb_lock_lock(db->rclock);
+        cval = cdb_ht_get(db->rcache, key, ksize, &item_vsize, false);
+        if (cval) {
+            /* record already exists; a cached value starts with the record
+             * offset followed by the expire time */
+            ooff = rec.ooff = *(FOFF*)cval;
+            rec.osize = item_vsize - SFOFF - SI4;
+            old_expire = *(uint32_t*)(cval + SFOFF); 
+        }
+        cdb_lock_unlock(db->rclock);
+        if (old_expire && old_expire <= now)
+            /* once exist but expired? */
+            expired = true;
+    }
+    
+    if (OFFNULL(ooff)) {
+        /* not found through the record cache: look the key up on disk */
+        FOFF soffs[SFOFFNUM];
+        FOFF *soff = soffs;
+        char sbuf[SBUFSIZE];
+        CDBREC *rrec = (CDBREC*)sbuf;
+        
+        int retnum;
+        if ((retnum = cdb_getoff(db, hash, &soff, CDB_LOCKED)) < 0) {
+            cdb_lock_unlock(db->mlock[lockid]);
+            return -1;
+        }
+            
+        for(int i = 0; i < retnum; i++) {
+            /* check for duplicate records/older version*/
+            int cret;
+            if (rrec != (CDBREC*)sbuf) {
+                free(rrec);
+                rrec = (CDBREC*)sbuf;
+            }
+            
+            struct timespec ts;
+            _cdb_timerreset(&ts);
+            cret = db->vio->rrec(db->vio, &rrec, soff[i], false);
+            db->rcount++;
+            db->rtime += _cdb_timermicrosec(&ts);
+            
+            if (cret < 0)
+                continue;
+                
+            if (ksize == rrec->ksize && memcmp(rrec->key, key, ksize) == 0) {
+                /* got its old meta info */
+                rec.osize = rrec->osize;
+                rec.ooff = rrec->ooff;
+                ooff = rec.ooff;
+                /* expire == 0 means "never expires"; only a non-zero
+                 * timestamp in the past marks the old record as expired,
+                 * same rule as the cache path above and cdb_get */
+                if (rrec->expire && rrec->expire <= now)
+                    expired = true;
+                break;
+            }
+        }
+        if (soff != soffs)
+            free(soff);
+        if (rrec != (CDBREC*)sbuf) 
+            free(rrec);
+    }
+    
+    if (OFFNOTNULL(ooff) && !expired) {
+        /* record already exists*/
+        if (opt & CDB_INSERTIFNOEXIST) {
+            cdb_lock_unlock(db->mlock[lockid]);
+            cdb_seterrno(db, CDB_EXIST, __FILE__, __LINE__);
+            return -2;
+        }
+    } else {
+        if (opt & CDB_INSERTIFEXIST) {
+            cdb_lock_unlock(db->mlock[lockid]);
+            cdb_seterrno(db, CDB_NOTFOUND, __FILE__, __LINE__);
+            return -3;
+        }
+    }
+    
+    struct timespec ts;
+    _cdb_timerreset(&ts);
+    if (db->vio->wrec(db->vio, &rec, &noff) < 0) {
+        cdb_lock_unlock(db->mlock[lockid]);
+        return -1;
+    }
+    db->wcount++;
+    db->wtime += _cdb_timermicrosec(&ts);
+    
+    /* point the index at the new record: swap the old offset when one is
+     * known, otherwise add a new entry */
+    if (OFFNOTNULL(ooff)) {
+        cdb_replaceoff(db, hash, ooff, noff, CDB_LOCKED);
+    } else {
+        cdb_updatepage(db, hash, noff, CDB_PAGEINSERTOFF, CDB_LOCKED);
+    }
+    
+    if (db->rcache) {
+        if ((opt & CDB_INSERTCACHE) == CDB_INSERTCACHE) {
+            char *cval;
+            CDBHTITEM *item = cdb_ht_newitem(db->rcache, ksize, vsize + SI4 + SFOFF);
+            memcpy(cdb_ht_itemkey(db->rcache, item), key, ksize);
+            cval = cdb_ht_itemval(db->rcache, item);
+            memcpy(cval + SI4 + SFOFF, val, vsize);
+            *(FOFF*)(cval) = rec.ooff;
+            *(uint32_t*)(cval + SFOFF) = rec.expire;
+            cdb_lock_lock(db->rclock);
+            cdb_ht_insert(db->rcache, item);
+            cdb_lock_unlock(db->rclock);
+        }
+    } 
+    cdb_lock_unlock(db->mlock[lockid]);
+    
+    if (RCOVERFLOW(db))
+        _cdb_recout(db);
+
+    cdb_seterrno(db, CDB_SUCCESS, __FILE__, __LINE__);
+    return 0;
+}
+
+
+
+/* Look a key up and return a heap copy of its value.
+ *   val    out: malloc'ed copy of the value, NULL when not found; release it
+ *          with cdb_free_val
+ *   vsize  out: length of *val, 0 when not found
+ * The record cache is consulted first, then the index and the storage
+ * backend; a value found on disk is added to the record cache.
+ * Returns 0 on success, -3 when the key is missing or expired, -1 when the
+ * index page cannot be read. */
+int cdb_get(CDB *db, const char *key, int ksize, void **val, int *vsize)
+{
+    char sbuf[SBUFSIZE];
+    CDBREC *rec = (CDBREC *)sbuf;
+    FOFF soffs[SFOFFNUM];
+    FOFF *offs;
+    int dupnum, ret = -3;
+    uint64_t hash;
+    uint32_t now = time(NULL);
+    uint32_t lockid;
+
+    *vsize = 0;
+    *val = NULL;
+    if (db->rcache) {
+        char *cval;
+        cdb_lock_lock(db->rclock);
+        cval = cdb_ht_get(db->rcache, key, ksize, vsize, true);
+        if (cval) {
+            db->rchit++;
+            if (db->vio) {
+                /* with a storage backend the cached value is prefixed by the
+                 * record offset and the expire time */
+                (*vsize) -= SI4 + SFOFF;
+                if (*(uint32_t*)(cval + SFOFF)
+                    && *(uint32_t*)(cval + SFOFF) <= now) {
+                    cdb_lock_unlock(db->rclock);
+                    /* report "not found" with the same outputs as every
+                     * other miss: no value and a zero length */
+                    *vsize = 0;
+                    /* not found no not report error now */
+                    //cdb_seterrno(db, CDB_NOTFOUND, __FILE__, __LINE__);
+                    return -3;
+                }
+                cval = (void*)(cval + SI4 + SFOFF);
+            }
+            *val = malloc(*vsize);
+            memcpy(*val, cval, *vsize);
+            cdb_lock_unlock(db->rclock);
+            return 0;
+        } else {
+            db->rcmiss++;
+            if (db->vio == NULL) {
+                /* MEMDB mode: the record cache is the only store */
+                cdb_lock_unlock(db->rclock);
+                return -3;
+            }
+        }
+        cdb_lock_unlock(db->rclock);
+    }
+
+    offs = soffs;
+    hash = CDBHASH64(key, ksize);
+    lockid = (hash >> 24) % db->hsize % MLOCKNUM;
+    cdb_lock_lock(db->mlock[lockid]);
+    dupnum = cdb_getoff(db, hash, &offs, CDB_LOCKED);
+    if (dupnum < 0) {
+        cdb_lock_unlock(db->mlock[lockid]);
+        return -1;
+    }
+
+    for(int i = 0; i < dupnum; i++) {
+        int cret;
+        /* drop a heap record left over from the previous candidate */
+        if (rec != (CDBREC*)sbuf) {
+            free(rec);
+            rec = (CDBREC*)sbuf;
+        }
+
+        struct timespec ts;
+        _cdb_timerreset(&ts);
+        cret = db->vio->rrec(db->vio, &rec, offs[i], true);
+        db->rcount++;
+        db->rtime += _cdb_timermicrosec(&ts);
+
+        if (cret < 0)
+            continue;
+
+        /* candidates share the hash; only an exact key match counts */
+        if (ksize == rec->ksize && memcmp(rec->key, key, ksize) == 0) {
+            if (rec->expire && rec->expire <= now) {
+                break;
+            }
+            *vsize = rec->vsize;
+            *val = malloc(*vsize);
+            memcpy(*val, rec->val, *vsize);
+            ret = 0;
+            break;
+        } 
+    }
+
+    if (ret == 0 && db->rcache) {
+        /* remember the value in the record cache for the next lookup */
+        char *cval;
+        CDBHTITEM *item = cdb_ht_newitem(db->rcache, ksize, *vsize + SI4 + SFOFF);
+        memcpy(cdb_ht_itemkey(db->rcache, item), key, ksize);
+        cval = cdb_ht_itemval(db->rcache, item);
+        memcpy(cval + SI4 + SFOFF, *val, *vsize);
+        *(FOFF*)(cval) = rec->ooff;
+        *(uint32_t*)(cval + SFOFF) = rec->expire;
+        cdb_lock_lock(db->rclock);
+        cdb_ht_insert(db->rcache, item);
+        cdb_lock_unlock(db->rclock);
+    }
+    cdb_lock_unlock(db->mlock[lockid]);
+    
+    if (RCOVERFLOW(db))
+        _cdb_recout(db);
+            
+    if (offs != soffs)
+        free(offs);
+        
+    if (rec != (CDBREC*)sbuf) 
+        free(rec);
+
+    if (ret < 0)
+        cdb_seterrno(db, CDB_NOTFOUND, __FILE__, __LINE__);
+    else {
+        /* NOTE(review): rcmiss was already bumped above when the record
+         * cache missed -- confirm the second increment is intended */
+        db->rcmiss++;
+        cdb_seterrno(db, CDB_SUCCESS, __FILE__, __LINE__);
+    }
+    return ret;
+}
+
+
+/* Release a value returned by cdb_get and reset the caller's pointer to
+ * NULL.  Safe to call when *val is already NULL. */
+void cdb_free_val(void **val)
+{
+    void *held = *val;
+
+    if (held != NULL)
+        free(held);
+
+    *val = NULL;
+}
+
+
+/* Delete the record stored under `key`.
+ * In MEMDB mode (no storage backend) the key is only removed from the record
+ * cache.  Otherwise the record's offset is taken from the record cache or
+ * looked up on disk, its index entry is removed and the backend's drec hook
+ * is invoked.
+ * Returns 0 on success (always in MEMDB mode), -3 when the key is not found,
+ * -1 when the index page cannot be read. */
+int cdb_del(CDB *db, const char *key, int ksize)
+{
+    FOFF ooff;
+    CDBREC rec;
+    uint32_t lockid;
+    uint64_t hash;
+    
+    OFFZERO(rec.ooff);
+    OFFZERO(ooff);
+    rec.osize = 0;
+    rec.key = (char*)key;
+    rec.ksize = ksize;
+    rec.val = NULL;
+    rec.vsize = 0;
+    /* give the remaining fields a defined value: on the disk lookup path
+     * they are never assigned before rec is handed to drec */
+    rec.expire = 0;
+    rec.oid = 0;
+    
+    if (db->vio == NULL) {
+        /* if it is a memdb, just operate on the record cache and return */
+        cdb_lock_lock(db->rclock);
+        cdb_ht_del2(db->rcache, key, ksize);
+        cdb_lock_unlock(db->rclock);
+        if (RCOVERFLOW(db))
+            _cdb_recout(db);
+        return 0;
+    }
+    
+    hash = CDBHASH64(key, ksize);
+    lockid = (hash >> 24) % db->hsize % MLOCKNUM;
+    cdb_lock_lock(db->mlock[lockid]);
+    if (db->rcache) {
+        /* if record already exists, get its old meta info */
+        CDBHTITEM *item;
+        cdb_lock_lock(db->rclock);
+        item = cdb_ht_del(db->rcache, key, ksize);
+        cdb_lock_unlock(db->rclock);
+        if (item) {
+            /* a cached value starts with the record offset followed by the
+             * expire time */
+            char *cval = cdb_ht_itemval(db->rcache, item);
+            ooff = rec.ooff = *(FOFF*)cval;
+            rec.osize = item->vsize - SFOFF - SI4;
+            rec.expire = *(uint32_t*)(cval + SFOFF);
+            free(item);
+        }
+    }
+    
+    if (OFFNULL(ooff)) {
+        /* not found through the record cache: look the key up on disk */
+        FOFF soffs[SFOFFNUM];
+        FOFF *soff = soffs;
+        char sbuf[SBUFSIZE];
+        CDBREC *rrec = (CDBREC*)sbuf;
+        
+        int retnum;
+        if ((retnum = cdb_getoff(db, hash, &soff, CDB_LOCKED)) < 0) {
+            cdb_lock_unlock(db->mlock[lockid]);
+            return -1;
+        }
+            
+        for(int i = 0; i < retnum; i++) {
+            /* check for duplicate records/older version*/
+            int cret;
+            if (rrec != (CDBREC*)sbuf) {
+                free(rrec);
+                rrec = (CDBREC*)sbuf;
+            }
+            
+            struct timespec ts;
+            _cdb_timerreset(&ts);
+            cret = db->vio->rrec(db->vio, &rrec, soff[i], false);
+            db->rcount++;
+            db->rtime += _cdb_timermicrosec(&ts);
+            
+            if (cret < 0)
+                continue;
+                
+            if (ksize == rrec->ksize && memcmp(rrec->key, key, ksize) == 0) {
+                /* got its old meta info */
+                rec.osize = rrec->osize;
+                rec.ooff = rrec->ooff;
+                ooff = rec.ooff;
+                break;
+            }
+        }
+        if (soff != soffs)
+            free(soff);
+        if (rrec != (CDBREC*)sbuf) 
+            free(rrec);
+    }
+    
+    if (OFFNOTNULL(ooff)) {
+        /* drop the index entry under the bucket lock, then let the backend
+         * handle the record itself */
+        cdb_updatepage(db, hash, ooff, CDB_PAGEDELETEOFF, CDB_LOCKED);
+        cdb_lock_unlock(db->mlock[lockid]);
+        
+        struct timespec ts;
+        _cdb_timerreset(&ts);
+        if (db->vio->drec(db->vio, &rec, ooff) < 0)
+            ; // return -1;  succeed or not doesn't matter
+        db->wcount++;
+        db->wtime += _cdb_timermicrosec(&ts);
+        cdb_seterrno(db, CDB_SUCCESS, __FILE__, __LINE__);
+        return 0;
+    } else {
+        cdb_lock_unlock(db->mlock[lockid]);
+        cdb_seterrno(db, CDB_NOTFOUND, __FILE__, __LINE__);
+        return -3;
+    }
+}
+
+
+/* Report or reset the runtime statistics.
+ * With stat == NULL the hit/miss counters and the read/write counters and
+ * timers are zeroed.  Otherwise *stat receives the record count, the cache
+ * populations, the hit/miss counters and the average read/write latencies
+ * (0 when nothing has been counted yet). */
+void cdb_stat(CDB *db, CDBSTAT *stat)
+{
+    if (stat != NULL) {
+        stat->rnum = db->rnum;
+        stat->pnum = db->hsize;
+
+        if (db->rcache)
+            stat->rcnum = db->rcache->num;
+        else
+            stat->rcnum = 0;
+        stat->pcnum = (db->pcache? db->pcache->num : 0) 
+            + (db->dpcache? db->dpcache->num : 0);
+
+        stat->rchit = db->rchit;
+        stat->rcmiss = db->rcmiss;
+        stat->pchit = db->pchit;
+        stat->pcmiss = db->pcmiss;
+
+        if (db->rcount)
+            stat->rlatcy = db->rtime / db->rcount;
+        else
+            stat->rlatcy = 0;
+        if (db->wcount)
+            stat->wlatcy = db->wtime / db->wcount;
+        else
+            stat->wlatcy = 0;
+        return;
+    }
+
+    /* no output structure: reset the counters */
+    db->rcmiss = 0;
+    db->rchit = 0;
+    db->pcmiss = 0;
+    db->pchit = 0;
+    db->rtime = 0;
+    db->rcount = 0;
+    db->wtime = 0;
+    db->wcount = 0;
+}
+
+
+/* Close an opened database: stop the background task, destroy the caches
+ * (flushing dirty index pages first), write the header and close the storage
+ * backend, release the bloom filter and the main table, then restore the
+ * default parameters.
+ * Returns 0 on success, -1 when the database is not open. */
+int cdb_close(CDB *db)
+{
+    if (!db->opened)
+        return -1;
+
+    if (db->bgtask)
+        cdb_bgtask_stop(db->bgtask);
+    if (db->rcache)
+        cdb_ht_destroy(db->rcache);
+    if (db->pcache)
+        cdb_ht_destroy(db->pcache);
+    if (db->dpcache) {
+        /* dirty pages must reach the disk before their cache goes away */
+        cdb_flushalldpage(db);
+        cdb_ht_destroy(db->dpcache);
+    }
+
+    if (db->vio) {
+        db->vio->whead(db->vio);
+        db->vio->close(db->vio);
+        cdb_vio_destroy(db->vio);
+    }
+    /* the bloom filter is created in cdb_open; release it here so that it
+     * does not leak across open/close cycles */
+    if (db->bf) {
+        cdb_bf_destroy(db->bf);
+        db->bf = NULL;
+    }
+    if (db->mtable)
+        free(db->mtable);
+    db->opened = false;
+    _cdb_defparam(db);
+    return 0;
+}
+
+
+void cdb_deferrorcb(void *arg, int errno, const char *file, int line)
+{
+    fprintf(stderr, "DBERR: [%s:%d] %d - %s\n", file, line, errno, cdb_errmsg(errno));
+}
+
+
+/* Close the database if it is still open and release everything owned by the
+ * handle: the locks, the background-task object, the per-thread error key and
+ * the handle itself.  Always returns 0. */
+int cdb_destroy(CDB *db)
+{
+    if (db->opened)
+        cdb_close(db);
+    for(int i = 0; i < MLOCKNUM; i++)
+        cdb_lock_destory(db->mlock[i]);
+    cdb_lock_destory(db->dpclock);
+    cdb_lock_destory(db->pclock);
+    cdb_lock_destory(db->rclock);
+    cdb_lock_destory(db->stlock);
+    cdb_lock_destory(db->oidlock);
+    cdb_lock_destory(db->bflock);
+    /* cdb_open sets bgtask to NULL in MEMDB mode, so guard the call */
+    if (db->bgtask)
+        cdb_bgtask_destroy(db->bgtask);
+    pthread_key_delete(*(pthread_key_t*)db->errkey);
+    free(db->errkey);
+    free(db);
+    return 0;
+}
+
+
+
diff --git a/libdap-cuttdb/src/cdb_core.h b/libdap-cuttdb/src/cdb_core.h
new file mode 100644
index 0000000000000000000000000000000000000000..bcd4ad9a6bef0d43993781f3707ed0f44935cae2
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_core.h
@@ -0,0 +1,122 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#ifndef _CDB_CORE_H_
+#define _CDB_CORE_H_
+#include "cuttdb.h"
+#include "cdb_types.h"
+#include "cdb_hashtable.h"
+#include "cdb_bloomfilter.h"
+#include "cdb_lock.h"
+#include "cdb_vio.h"
+#include "cdb_bgtask.h"
+#include <stdint.h>
+#include <stdbool.h>
+
+
+enum { /* NOTE(review): presumably the values of the 'opt' argument of cdb_updatepage - confirm */
+    CDB_PAGEDELETEOFF = 0, /* going by the name: remove an offset from an index page */
+    CDB_PAGEINSERTOFF = 1, /* going by the name: add an offset to an index page */
+};
+
+/* the DB object */
+struct CDB
+{
+    /* size limit for record cache */
+    uint64_t rclimit;
+    /* size limit for index page cache */
+    uint64_t pclimit;
+    /* size of bloom filter */
+    uint64_t bfsize;
+    /* number of records in the db */
+    uint64_t rnum;
+    /* ever-incrementing operation id */
+    uint64_t oid;
+    /* recovery point oid */
+    uint64_t roid;
+    /* hash table size */
+    uint32_t hsize;
+    /* last timestamp of no dirty page state */
+    uint32_t ndpltime;
+    /* whether the database is currently opened */
+    bool opened;
+    /* the size for a disk seek&read, should not be greater than SBUFSIZE */
+    uint32_t areadsize;
+
+    /* record cache */
+    CDBHASHTABLE *rcache;
+    /* (clean) index page cache */
+    CDBHASHTABLE *pcache;
+    /* dirty index page cache */
+    CDBHASHTABLE *dpcache;
+    /* Bloom Filter */
+    CDBBLOOMFILTER *bf;
+
+    /* lock for rcache */
+    CDBLOCK *rclock;
+    /* lock for pcache */
+    CDBLOCK *pclock;
+    /* lock for dpcache */
+    CDBLOCK *dpclock;
+    /* lock for hash table operation, split to MLOCKNUM groups */
+    CDBLOCK *mlock[MLOCKNUM];
+    /* lock for statistic */
+    CDBLOCK *stlock;
+    /* lock for operation id */
+    CDBLOCK *oidlock;
+    /* lock for bloom filter */
+    CDBLOCK *bflock;
+    /* background tasks in another thread */
+    CDBBGTASK *bgtask;
+
+    /* main hash table, contains 'hsize' elements */
+    FOFF *mtable;
+    /* disk i/o layer object */
+    CDBVIO *vio;
+
+    /* callback function when error occurs */
+    CDB_ERRCALLBACK errcb;
+    /* argument for callback function */
+    void *errcbarg;
+    /* heap-allocated pthread_key_t (accessed through a cast) holding the error code of the current thread */
+    void *errkey;
+
+    /* statistics below, these fields have no lock protection */
+    /* record cache hit/miss */
+    uint64_t rchit;
+    uint64_t rcmiss;
+    /* page cache hit/miss */
+    uint64_t pchit;
+    uint64_t pcmiss;
+    /* cumulative disk read time */
+    uint64_t rtime;
+    /* number of disk read operations */
+    uint64_t rcount;
+    /* cumulative disk write time */
+    uint64_t wtime;
+    /* number of disk write operations */
+    uint64_t wcount;
+};
+
+
+bool cdb_checkoff(CDB *db, uint64_t hash, FOFF off, int locked);
+int cdb_getoff(CDB *db, uint64_t hash, FOFF **offs, int locked);
+int cdb_replaceoff(CDB *db, uint64_t hash, FOFF off, FOFF noff, int locked);
+int cdb_updatepage(CDB *db, uint64_t hash, FOFF off, int opt, int locked);
+void cdb_flushalldpage(CDB *db);
+uint64_t cdb_genoid(CDB *db);
+
+#endif
+
diff --git a/libdap-cuttdb/src/cdb_crc64.c b/libdap-cuttdb/src/cdb_crc64.c
new file mode 100644
index 0000000000000000000000000000000000000000..6c72eb73fb3b8aaf774cd0e87479fc0fe82c580b
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_crc64.c
@@ -0,0 +1,170 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+/**************************************************************
+* *
+* File : crc64.c *
+* Function to compute the CRC64 *
+* *
+**************************************************************/
+#include "cdb_crc64.h"
+
+
+#define CONST64(n) (n##ULL)
+static uint64_t CRC64_Table[256] =
+{
+    CONST64(0x0000000000000000), CONST64(0x42f0e1eba9ea3693),
+    CONST64(0x85e1c3d753d46d26), CONST64(0xc711223cfa3e5bb5),
+    CONST64(0x493366450e42ecdf), CONST64(0x0bc387aea7a8da4c),
+    CONST64(0xccd2a5925d9681f9), CONST64(0x8e224479f47cb76a),
+    CONST64(0x9266cc8a1c85d9be), CONST64(0xd0962d61b56fef2d),
+    CONST64(0x17870f5d4f51b498), CONST64(0x5577eeb6e6bb820b),
+    CONST64(0xdb55aacf12c73561), CONST64(0x99a54b24bb2d03f2),
+    CONST64(0x5eb4691841135847), CONST64(0x1c4488f3e8f96ed4),
+    CONST64(0x663d78ff90e185ef), CONST64(0x24cd9914390bb37c),
+    CONST64(0xe3dcbb28c335e8c9), CONST64(0xa12c5ac36adfde5a),
+    CONST64(0x2f0e1eba9ea36930), CONST64(0x6dfeff5137495fa3),
+    CONST64(0xaaefdd6dcd770416), CONST64(0xe81f3c86649d3285),
+    CONST64(0xf45bb4758c645c51), CONST64(0xb6ab559e258e6ac2),
+    CONST64(0x71ba77a2dfb03177), CONST64(0x334a9649765a07e4),
+    CONST64(0xbd68d2308226b08e), CONST64(0xff9833db2bcc861d),
+    CONST64(0x388911e7d1f2dda8), CONST64(0x7a79f00c7818eb3b),
+    CONST64(0xcc7af1ff21c30bde), CONST64(0x8e8a101488293d4d),
+    CONST64(0x499b3228721766f8), CONST64(0x0b6bd3c3dbfd506b),
+    CONST64(0x854997ba2f81e701), CONST64(0xc7b97651866bd192),
+    CONST64(0x00a8546d7c558a27), CONST64(0x4258b586d5bfbcb4),
+    CONST64(0x5e1c3d753d46d260), CONST64(0x1cecdc9e94ace4f3),
+    CONST64(0xdbfdfea26e92bf46), CONST64(0x990d1f49c77889d5),
+    CONST64(0x172f5b3033043ebf), CONST64(0x55dfbadb9aee082c),
+    CONST64(0x92ce98e760d05399), CONST64(0xd03e790cc93a650a),
+    CONST64(0xaa478900b1228e31), CONST64(0xe8b768eb18c8b8a2),
+    CONST64(0x2fa64ad7e2f6e317), CONST64(0x6d56ab3c4b1cd584),
+    CONST64(0xe374ef45bf6062ee), CONST64(0xa1840eae168a547d),
+    CONST64(0x66952c92ecb40fc8), CONST64(0x2465cd79455e395b),
+    CONST64(0x3821458aada7578f), CONST64(0x7ad1a461044d611c),
+    CONST64(0xbdc0865dfe733aa9), CONST64(0xff3067b657990c3a),
+    CONST64(0x711223cfa3e5bb50), CONST64(0x33e2c2240a0f8dc3),
+    CONST64(0xf4f3e018f031d676), CONST64(0xb60301f359dbe0e5),
+    CONST64(0xda050215ea6c212f), CONST64(0x98f5e3fe438617bc),
+    CONST64(0x5fe4c1c2b9b84c09), CONST64(0x1d14202910527a9a),
+    CONST64(0x93366450e42ecdf0), CONST64(0xd1c685bb4dc4fb63),
+    CONST64(0x16d7a787b7faa0d6), CONST64(0x5427466c1e109645),
+    CONST64(0x4863ce9ff6e9f891), CONST64(0x0a932f745f03ce02),
+    CONST64(0xcd820d48a53d95b7), CONST64(0x8f72eca30cd7a324),
+    CONST64(0x0150a8daf8ab144e), CONST64(0x43a04931514122dd),
+    CONST64(0x84b16b0dab7f7968), CONST64(0xc6418ae602954ffb),
+    CONST64(0xbc387aea7a8da4c0), CONST64(0xfec89b01d3679253),
+    CONST64(0x39d9b93d2959c9e6), CONST64(0x7b2958d680b3ff75),
+    CONST64(0xf50b1caf74cf481f), CONST64(0xb7fbfd44dd257e8c),
+    CONST64(0x70eadf78271b2539), CONST64(0x321a3e938ef113aa),
+    CONST64(0x2e5eb66066087d7e), CONST64(0x6cae578bcfe24bed),
+    CONST64(0xabbf75b735dc1058), CONST64(0xe94f945c9c3626cb),
+    CONST64(0x676dd025684a91a1), CONST64(0x259d31cec1a0a732),
+    CONST64(0xe28c13f23b9efc87), CONST64(0xa07cf2199274ca14),
+    CONST64(0x167ff3eacbaf2af1), CONST64(0x548f120162451c62),
+    CONST64(0x939e303d987b47d7), CONST64(0xd16ed1d631917144),
+    CONST64(0x5f4c95afc5edc62e), CONST64(0x1dbc74446c07f0bd),
+    CONST64(0xdaad56789639ab08), CONST64(0x985db7933fd39d9b),
+    CONST64(0x84193f60d72af34f), CONST64(0xc6e9de8b7ec0c5dc),
+    CONST64(0x01f8fcb784fe9e69), CONST64(0x43081d5c2d14a8fa),
+    CONST64(0xcd2a5925d9681f90), CONST64(0x8fdab8ce70822903),
+    CONST64(0x48cb9af28abc72b6), CONST64(0x0a3b7b1923564425),
+    CONST64(0x70428b155b4eaf1e), CONST64(0x32b26afef2a4998d),
+    CONST64(0xf5a348c2089ac238), CONST64(0xb753a929a170f4ab),
+    CONST64(0x3971ed50550c43c1), CONST64(0x7b810cbbfce67552),
+    CONST64(0xbc902e8706d82ee7), CONST64(0xfe60cf6caf321874),
+    CONST64(0xe224479f47cb76a0), CONST64(0xa0d4a674ee214033),
+    CONST64(0x67c58448141f1b86), CONST64(0x253565a3bdf52d15),
+    CONST64(0xab1721da49899a7f), CONST64(0xe9e7c031e063acec),
+    CONST64(0x2ef6e20d1a5df759), CONST64(0x6c0603e6b3b7c1ca),
+    CONST64(0xf6fae5c07d3274cd), CONST64(0xb40a042bd4d8425e),
+    CONST64(0x731b26172ee619eb), CONST64(0x31ebc7fc870c2f78),
+    CONST64(0xbfc9838573709812), CONST64(0xfd39626eda9aae81),
+    CONST64(0x3a28405220a4f534), CONST64(0x78d8a1b9894ec3a7),
+    CONST64(0x649c294a61b7ad73), CONST64(0x266cc8a1c85d9be0),
+    CONST64(0xe17dea9d3263c055), CONST64(0xa38d0b769b89f6c6),
+    CONST64(0x2daf4f0f6ff541ac), CONST64(0x6f5faee4c61f773f),
+    CONST64(0xa84e8cd83c212c8a), CONST64(0xeabe6d3395cb1a19),
+    CONST64(0x90c79d3fedd3f122), CONST64(0xd2377cd44439c7b1),
+    CONST64(0x15265ee8be079c04), CONST64(0x57d6bf0317edaa97),
+    CONST64(0xd9f4fb7ae3911dfd), CONST64(0x9b041a914a7b2b6e),
+    CONST64(0x5c1538adb04570db), CONST64(0x1ee5d94619af4648),
+    CONST64(0x02a151b5f156289c), CONST64(0x4051b05e58bc1e0f),
+    CONST64(0x87409262a28245ba), CONST64(0xc5b073890b687329),
+    CONST64(0x4b9237f0ff14c443), CONST64(0x0962d61b56fef2d0),
+    CONST64(0xce73f427acc0a965), CONST64(0x8c8315cc052a9ff6),
+    CONST64(0x3a80143f5cf17f13), CONST64(0x7870f5d4f51b4980),
+    CONST64(0xbf61d7e80f251235), CONST64(0xfd913603a6cf24a6),
+    CONST64(0x73b3727a52b393cc), CONST64(0x31439391fb59a55f),
+    CONST64(0xf652b1ad0167feea), CONST64(0xb4a25046a88dc879),
+    CONST64(0xa8e6d8b54074a6ad), CONST64(0xea16395ee99e903e),
+    CONST64(0x2d071b6213a0cb8b), CONST64(0x6ff7fa89ba4afd18),
+    CONST64(0xe1d5bef04e364a72), CONST64(0xa3255f1be7dc7ce1),
+    CONST64(0x64347d271de22754), CONST64(0x26c49cccb40811c7),
+    CONST64(0x5cbd6cc0cc10fafc), CONST64(0x1e4d8d2b65facc6f),
+    CONST64(0xd95caf179fc497da), CONST64(0x9bac4efc362ea149),
+    CONST64(0x158e0a85c2521623), CONST64(0x577eeb6e6bb820b0),
+    CONST64(0x906fc95291867b05), CONST64(0xd29f28b9386c4d96),
+    CONST64(0xcedba04ad0952342), CONST64(0x8c2b41a1797f15d1),
+    CONST64(0x4b3a639d83414e64), CONST64(0x09ca82762aab78f7),
+    CONST64(0x87e8c60fded7cf9d), CONST64(0xc51827e4773df90e),
+    CONST64(0x020905d88d03a2bb), CONST64(0x40f9e43324e99428),
+    CONST64(0x2cffe7d5975e55e2), CONST64(0x6e0f063e3eb46371),
+    CONST64(0xa91e2402c48a38c4), CONST64(0xebeec5e96d600e57),
+    CONST64(0x65cc8190991cb93d), CONST64(0x273c607b30f68fae),
+    CONST64(0xe02d4247cac8d41b), CONST64(0xa2dda3ac6322e288),
+    CONST64(0xbe992b5f8bdb8c5c), CONST64(0xfc69cab42231bacf),
+    CONST64(0x3b78e888d80fe17a), CONST64(0x7988096371e5d7e9),
+    CONST64(0xf7aa4d1a85996083), CONST64(0xb55aacf12c735610),
+    CONST64(0x724b8ecdd64d0da5), CONST64(0x30bb6f267fa73b36),
+    CONST64(0x4ac29f2a07bfd00d), CONST64(0x08327ec1ae55e69e),
+    CONST64(0xcf235cfd546bbd2b), CONST64(0x8dd3bd16fd818bb8),
+    CONST64(0x03f1f96f09fd3cd2), CONST64(0x41011884a0170a41),
+    CONST64(0x86103ab85a2951f4), CONST64(0xc4e0db53f3c36767),
+    CONST64(0xd8a453a01b3a09b3), CONST64(0x9a54b24bb2d03f20),
+    CONST64(0x5d45907748ee6495), CONST64(0x1fb5719ce1045206),
+    CONST64(0x919735e51578e56c), CONST64(0xd367d40ebc92d3ff),
+    CONST64(0x1476f63246ac884a), CONST64(0x568617d9ef46bed9),
+    CONST64(0xe085162ab69d5e3c), CONST64(0xa275f7c11f7768af),
+    CONST64(0x6564d5fde549331a), CONST64(0x279434164ca30589),
+    CONST64(0xa9b6706fb8dfb2e3), CONST64(0xeb46918411358470),
+    CONST64(0x2c57b3b8eb0bdfc5), CONST64(0x6ea7525342e1e956),
+    CONST64(0x72e3daa0aa188782), CONST64(0x30133b4b03f2b111),
+    CONST64(0xf7021977f9cceaa4), CONST64(0xb5f2f89c5026dc37),
+    CONST64(0x3bd0bce5a45a6b5d), CONST64(0x79205d0e0db05dce),
+    CONST64(0xbe317f32f78e067b), CONST64(0xfcc19ed95e6430e8),
+    CONST64(0x86b86ed5267cdbd3), CONST64(0xc4488f3e8f96ed40),
+    CONST64(0x0359ad0275a8b6f5), CONST64(0x41a94ce9dc428066),
+    CONST64(0xcf8b0890283e370c), CONST64(0x8d7be97b81d4019f),
+    CONST64(0x4a6acb477bea5a2a), CONST64(0x089a2aacd2006cb9),
+    CONST64(0x14dea25f3af9026d), CONST64(0x562e43b4931334fe),
+    CONST64(0x913f6188692d6f4b), CONST64(0xd3cf8063c0c759d8),
+    CONST64(0x5dedc41a34bbeeb2), CONST64(0x1f1d25f19d51d821),
+    CONST64(0xd80c07cd676f8394), CONST64(0x9afce626ce85b507)
+};
+
+
+uint64_t cdb_crc64(const void *buf, uint32_t len)
+{
+    /* Table-driven CRC-64 over len bytes at buf, most significant byte first,
+     * starting from all ones; the register is returned without a final XOR. */
+    const uint8_t *p = (const uint8_t *)buf;
+    const uint8_t *end = p + len;
+    uint64_t acc = 0xFFFFFFFFFFFFFFFF;
+    while (p != end)
+        acc = CRC64_Table[(uint8_t)(acc >> 56) ^ *p++] ^ (acc << 8);
+    return acc;
+}
+
diff --git a/libdap-cuttdb/src/cdb_crc64.h b/libdap-cuttdb/src/cdb_crc64.h
new file mode 100644
index 0000000000000000000000000000000000000000..50744fc844afe84cdcef8ddba5f6cff81ae5599a
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_crc64.h
@@ -0,0 +1,22 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#ifndef _CDB_CRC64_H_
+#define _CDB_CRC64_H_
+#include <stdint.h>
+
+uint64_t cdb_crc64(const void *buf, uint32_t len);
+
+#endif
diff --git a/libdap-cuttdb/src/cdb_dumpdb.c b/libdap-cuttdb/src/cdb_dumpdb.c
new file mode 100644
index 0000000000000000000000000000000000000000..99cddbb5355cb44d235ef08b27234a7199c2fe67
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_dumpdb.c
@@ -0,0 +1,68 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+#include "cuttdb.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+
+/*
+ * Iteration callback: print one record to stdout as "key<TAB>value<TAB>expire".
+ * key and val are not NUL-terminated, so each is printed with an explicit
+ * precision (ksize / vsize) straight from the caller's buffers.  This
+ * replaces a copy into a scratch buffer whose malloc() for large records was
+ * never checked and whose size (ksize + vsize + 2) was computed in int.
+ * Each field is cut at an embedded NUL byte.  arg and oid are unused.
+ * Always returns true.
+ */
+bool itcb(void *arg, const char *key, int ksize, const char *val, int vsize, uint32_t expire, uint64_t oid)
+{
+    (void)arg;
+    (void)oid;
+    printf("%.*s\t%.*s\t%u\n", ksize, key, vsize, val, expire);
+    return true;
+}
+
+int main(int argc, char *argv[])
+{
+    /* Dump every record of the database at argv[1] to stdout through itcb. */
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s dbpath [cachelimit(MB)].... \n", argv[0]);
+        return -1;
+    }
+
+    /* optional second argument replaces the default of 1048576 (MB per the usage text, i.e. 1TB) */
+    int cache_limit = (argc > 2) ? atoi(argv[2]) : 1048576;
+
+    CDB *db = cdb_new();
+    cdb_option(db, 0, 0, cache_limit);
+    int rc = cdb_open(db, argv[1], CDB_PAGEWARMUP);
+    if (rc < 0) {
+        fprintf(stderr, "Database open error, unable to recovery\n");
+        return -1;
+    }
+
+    void *cursor = cdb_iterate_new(db, 0);
+    cdb_iterate(db, itcb, NULL, cursor);
+    cdb_iterate_destroy(db, cursor);
+    cdb_destroy(db);
+}
+
+
+
+
+
diff --git a/libdap-cuttdb/src/cdb_dumpraw.c b/libdap-cuttdb/src/cdb_dumpraw.c
new file mode 100644
index 0000000000000000000000000000000000000000..53bbe11c6e7bec1723c97fb951b63468889d555a
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_dumpraw.c
@@ -0,0 +1,115 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <stdint.h>
+
+#define SI4 4
+#define SI8 8
+
+/* data record */
+typedef struct {
+    /* disk store starts at following field */
+    uint32_t magic;  /* RECMAGIC for a record that is dumped, DELRECMAGIC for one that is skipped (presumably deleted) */
+    uint32_t ksize;  /* number of key bytes at the start of buf */
+    uint32_t vsize;  /* number of value bytes following the key in buf */
+    uint32_t expire; /* printed as-is; presumably an expiry timestamp - confirm */
+    uint64_t oid;    /* not used by this tool */
+    char buf[0];     /* key bytes immediately followed by value bytes */
+} __attribute__((packed)) CDBREC;
+
+/* real size of a record header when stored on disk */
+#define RECHSIZE (SI4 * 4 + SI8)
+/* real size of a record when stored on disk */
+#define RECSIZE(r) (RECHSIZE + (r)->ksize + (r)->vsize)
+
+#define FILEMETASIZE 64 /* offset at which process() starts looking for records */
+#define ALIGNBYTES 16 /* step of the record scan; record sizes are rounded up to it */
+#define RECMAGIC 0x19871022 /* magic of a record that gets printed */
+#define DELRECMAGIC 0x19871023 /* magic of a record that is stepped over */
+#define FILEMAGICHEADER "CuTtDbFiLePaRtIaL"
+#define FILEMAGICLEN (sizeof(FILEMAGICHEADER) - 1) /* compile-time length, without the NUL */
+#define OFFALIGNED(off) (((off) & (ALIGNBYTES - 1))? ((off) | (ALIGNBYTES - 1)) + 1: (off))
+
+
+
+/*
+ * Print the live records of one cuttdb data file as "key<TAB>value<TAB>expire"
+ * lines, scanning the mmap()ed file for record magics in ALIGNBYTES steps.
+ * Unlike before, a failed open()/mmap() skips the file, records that would run
+ * past the end of the file are not followed, the mapping is always released,
+ * and fields are printed straight from the mapping (no unchecked malloc()).
+ */
+void process(const char *filename)
+{
+    int fd = open(filename, O_RDONLY, 0644);
+    if (fd < 0) {
+        fprintf(stderr, "%s Open failed\n", filename);
+        return;
+    }
+
+    long filesize = lseek(fd, 0, SEEK_END);
+    long pos = FILEMETASIZE;
+    char *map = MAP_FAILED;
+    if (filesize >= (long)FILEMAGICLEN) /* also filters lseek() == -1 and empty files */
+        map = (char*)mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0);
+    if (map == MAP_FAILED || memcmp(map, FILEMAGICHEADER, FILEMAGICLEN)) {
+        fprintf(stderr, "%s is not a cuttdb file\n", filename);
+        if (map != MAP_FAILED)
+            munmap(map, filesize);
+        close(fd);
+        return;
+    }
+
+    while (pos + RECHSIZE <= filesize) {
+        CDBREC *rec = (CDBREC*)&map[pos];
+        /* 64-bit sum (ksize + vsize may wrap in 32 bits); capped at INT_MAX for printf's precision */
+        uint64_t recsize = (uint64_t)RECHSIZE + rec->ksize + rec->vsize;
+        if ((rec->magic != RECMAGIC && rec->magic != DELRECMAGIC)
+            || recsize > (uint64_t)(filesize - pos) || recsize > 0x7fffffff) {
+            pos += ALIGNBYTES; /* no usable record here: keep scanning */
+            continue;
+        }
+        pos += OFFALIGNED(recsize);
+        if (rec->magic == RECMAGIC) /* DELRECMAGIC records are skipped; each field is cut at an embedded NUL */
+            printf("%.*s\t%.*s\t%u\n", (int)rec->ksize, rec->buf, (int)rec->vsize, rec->buf + rec->ksize, rec->expire);
+    }
+    munmap(map, filesize);
+    close(fd);
+}
+
+
+
+
+int main(int argc, char *argv[])
+{
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s dat########.cdb dat########.cdb .... \n", argv[0]);
+        return 0;
+    }
+    for(char **path = argv + 1; path != argv + argc; path++) /* one data file per argument */
+        process(*path);
+    return 0;
+}
+
+
+
+
diff --git a/libdap-cuttdb/src/cdb_errno.c b/libdap-cuttdb/src/cdb_errno.c
new file mode 100644
index 0000000000000000000000000000000000000000..432d154ac4dba8d8a3879b16905ae69468c58094
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_errno.c
@@ -0,0 +1,78 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#include "cuttdb.h"
+#include "cdb_errno.h"
+#include "cdb_types.h"
+#include "cdb_core.h"
+#include <pthread.h>
+
+
+int cdb_errno(CDB *db)
+{ /* Return the code stored for the calling thread by cdb_seterrno; errkey points to the pthread key it is kept under. */
+    return (long)pthread_getspecific(*(pthread_key_t*)db->errkey); /* the code travels inside the void* slot, hence the cast */
+}
+
+/*
+ * Map a CDB_* error code to a constant English description.
+ * Codes without an entry below yield "Error For Errno".  The result points
+ * to a string literal, so it stays valid and must not be freed or modified.
+ * The switch is written one code per line so that the code/message table
+ * can be scanned at a glance.
+ */
+const char *cdb_errmsg(int ecode)
+{
+    const char *text;
+
+    switch(ecode) {
+        case CDB_SUCCESS:      text = "Success"; break;
+        case CDB_NOTFOUND:     text = "Key Not Found"; break;
+        case CDB_EXIST:        text = "Item Already Exists"; break;
+
+        case CDB_DIRNOEXIST:   text = "Path Open Failed"; break;
+        case CDB_OPENERR:      text = "File Open Failed"; break;
+        case CDB_PIDEXIST:     text = "Opened By Another Process"; break;
+
+        case CDB_DATAERRDAT:   text = "Data File Content Error"; break;
+        case CDB_DATAERRIDX:   text = "Index File Content Error"; break;
+
+        case CDB_WRITEERR:     text = "Write To File Error"; break;
+        case CDB_READERR:      text = "Read From File Error"; break;
+
+        case CDB_NOFID:        text = "Internal File Lost"; break;
+        case CDB_INTERNALERR:  text = "Internal Error"; break;
+        case CDB_DATAERRMETA:  text = "File Header Error"; break;
+        case CDB_MEMDBNOCACHE: text = "MemDB Mode With Zero Record Cache Size"; break;
+
+        default:               text = "Error For Errno"; break;
+    }
+    return text;
+}
+
+
+void cdb_seterrcb(CDB *db, CDB_ERRCALLBACK errcb, void *arg)
+{   /* Install errcb and the argument passed back to it; cdb_seterrno only calls a non-NULL callback. */
+    db->errcbarg = arg;
+    db->errcb = errcb;
+}
+
+
+void cdb_seterrno(CDB *db, int ecode, const char *source, int line)
+{
+    int notify = (ecode != CDB_SUCCESS) && db->errcb; /* callback only for real errors, and only if one is set */
+    pthread_setspecific(*(pthread_key_t*)db->errkey, (void*)(long)ecode); /* per-thread slot read back by cdb_errno */
+    if (notify)
+        db->errcb(db->errcbarg, ecode, source, line);
+}
diff --git a/libdap-cuttdb/src/cdb_errno.h b/libdap-cuttdb/src/cdb_errno.h
new file mode 100644
index 0000000000000000000000000000000000000000..f274819de73b2133d2648aa6490ea8f5cf66b41c
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_errno.h
@@ -0,0 +1,22 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#ifndef _CDB_ERRNO_H_
+#define _CDB_ERRNO_H_
+
+void cdb_seterrno(CDB *db, int ecode, const char *source, int line);
+
+#endif
+
diff --git a/libdap-cuttdb/src/cdb_hashtable.c b/libdap-cuttdb/src/cdb_hashtable.c
new file mode 100644
index 0000000000000000000000000000000000000000..f8746a681197799f797a18789b41387a77a6bc83
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_hashtable.c
@@ -0,0 +1,539 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#include "cdb_hashtable.h"
+#include <stdlib.h>
+#include <string.h>
+
+/*
+#define LRUPREV(i) (*(CDBHTITEM**)&((i)->buf[0]))
+#define LRUNEXT(i) (*(CDBHTITEM**)&((i)->buf[sizeof(void*)]))
+*/
+
+#define LRUPREV(i) ((i)->lruptr[0])
+#define LRUNEXT(i) ((i)->lruptr[1])
+
+/*
+ * Built-in hash over len bytes at key, seeded with the constant 0x19900917.
+ * Whole 4-byte words are read in host byte order, so values are not portable
+ * across endianness.  Words are fetched with memcpy() rather than through a
+ * cast pointer: key need not be 4-byte aligned, and the cast was undefined
+ * behaviour (alignment / aliasing) that can trap on strict-alignment CPUs.
+ */
+static uint32_t MurmurHash1( const void * key, int len)
+{
+    const unsigned int m = 0xc6a4a793;
+    const int r = 16;
+    unsigned int h = 0x19900917 ^ (len * m);
+    const unsigned char * data = (const unsigned char *)key;
+
+    for(; len >= 4; data += 4, len -= 4) {
+        unsigned int k;
+        memcpy(&k, data, sizeof(k));
+        h += k; h *= m; h ^= h >> 16;
+    }
+    switch(len) /* 0-3 trailing bytes; the cases fall through on purpose */
+    {
+    case 3: h += data[2] << 16; /* fall through */
+    case 2: h += data[1] << 8;  /* fall through */
+    case 1: h += data[0]; h *= m; h ^= h >> r;
+    };
+
+    h *= m; h ^= h >> 10;
+    h *= m; h ^= h >> 17;
+    return h;
+}
+
+void *cdb_ht_itemkey(CDBHASHTABLE *ht, CDBHTITEM *item)
+{ /* Address of the item's key bytes inside item->buf. */
+    return (void *)(item->buf + ht->lru * 2 * sizeof(void*)); /* in an LRU table the key comes after two pointers' worth of space */
+}
+
+void *cdb_ht_itemval(CDBHASHTABLE *ht, CDBHTITEM *item)
+{ /* Address of the item's value bytes, which directly follow its ksize key bytes. */
+    return (void *)(item->buf + ht->lru * 2 * sizeof(void*) + item->ksize); /* same base offset as cdb_ht_itemkey, plus the key length */
+}
+
+CDBHASHTABLE *cdb_ht_new(bool lru, CDBHASHFUNC hashfunc)
+{
+    /* Create an empty table.  lru selects whether items are also kept on the
+     * head/tail list; a NULL hashfunc selects the built-in MurmurHash1.
+     * ht->size ends up as the bytes of the table header plus all bucket
+     * arrays. */
+    CDBHASHTABLE *table = (CDBHASHTABLE*)malloc(sizeof(CDBHASHTABLE));
+
+    table->lru = lru;
+    table->hash = hashfunc ? hashfunc : MurmurHash1;
+    table->num = 0;
+    table->size = sizeof(CDBHASHTABLE);
+    table->head = NULL;
+    table->tail = NULL;
+
+    for(uint32_t b = 0; b < (1<<CDBHTBNUMPOW); b++) {
+        CDBHTBUCKET *slot = &(table->buckets[b]);
+        /* every bucket starts with two empty chains */
+        slot->bnum = 2;
+        slot->rnum = 0;
+        slot->items = (CDBHTITEM **)calloc(slot->bnum, sizeof(CDBHTITEM *));
+        table->size += sizeof(CDBHTITEM *) * slot->bnum;
+    }
+
+    return table;
+}
+
+CDBHTITEM *cdb_ht_newitem(CDBHASHTABLE *ht, int ksize, int vsize)
+{
+    /* Allocate an unlinked item with room for ksize key bytes and vsize value
+     * bytes; items of an LRU table get two extra pointers of space, and their
+     * list links start out NULL.  Key and value bytes are left uninitialised. */
+
+    int header = sizeof(CDBHTITEM);
+    if (ht->lru)
+        header += 2 * sizeof(void*);
+
+    CDBHTITEM *fresh = (CDBHTITEM*)malloc(header + ksize + vsize);
+    fresh->ksize = ksize;
+    fresh->vsize = vsize;
+    if (ht->lru) {
+        LRUPREV(fresh) = NULL;
+        LRUNEXT(fresh) = NULL;
+    }
+    return fresh;
+}
+
+
+
+
+void cdb_ht_insert(CDBHASHTABLE *ht, CDBHTITEM *item)
+{ /* Link item (key/value already filled in) into the table; an existing item with the same key is unlinked and freed first. */
+    uint32_t bid, hid;
+    CDBHTBUCKET *bucket;
+
+    item->hash = ht->hash(cdb_ht_itemkey(ht, item), item->ksize);
+    bid = item->hash & ((1<<CDBHTBNUMPOW)-1); /* low CDBHTBNUMPOW bits of the hash pick the bucket */
+    bucket = &(ht->buckets[bid]);
+    hid = (item->hash >> CDBHTBNUMPOW) & (bucket->bnum-1); /* the next bits pick the chain inside the bucket */
+
+    if (bucket->rnum > bucket->bnum * 2) { /* more than two items per chain on average: grow this bucket */
+        CDBHTITEM **ilist;
+        uint32_t exp = 2;
+        if (bucket->bnum < 512) /* buckets below 512 chains grow x4, larger ones x2 */
+            exp = 4;
+        int listsize = (bucket->bnum * exp) * sizeof(CDBHTITEM*);
+        ilist = (CDBHTITEM**)malloc(listsize);
+        memset(ilist, 0, listsize);
+        for(uint32_t i = 0; i < bucket->bnum; i++) { /* move every item to its chain in the larger array */
+            CDBHTITEM *curitem = bucket->items[i];
+            while(curitem != NULL) {
+                CDBHTITEM *nextitem = curitem->hnext;
+                uint32_t hid = (curitem->hash>>CDBHTBNUMPOW)
+                    & (bucket->bnum * exp - 1);
+                curitem->hnext = ilist[hid];
+                ilist[hid] = curitem;
+                curitem = nextitem;
+            }
+        }
+        free(bucket->items);
+        bucket->items = ilist;
+        ht->size += listsize - bucket->bnum * sizeof(CDBHTITEM *); /* account for the growth of the chain array */
+        bucket->bnum *= exp;
+        hid = (item->hash >> CDBHTBNUMPOW) & (bucket->bnum - 1); /* chain index changes with the new bnum */
+    }
+
+    { /* drop an already stored item that has the same key, if any */
+        CDBHTITEM *curitem = bucket->items[hid];
+        CDBHTITEM *preitem = NULL;
+        while(curitem != NULL) {
+            if (curitem->hash == item->hash
+                && curitem->ksize == item->ksize
+                && memcmp(cdb_ht_itemkey(ht, curitem),
+                cdb_ht_itemkey(ht, item) ,curitem->ksize) == 0) {
+                    CDBHTITEM *tmp;
+                    if (ht->lru) { /* unlink the old item from the LRU list */
+                        if (LRUPREV(curitem))
+                            LRUNEXT(LRUPREV(curitem)) = LRUNEXT(curitem);
+                        if (LRUNEXT(curitem))
+                            LRUPREV(LRUNEXT(curitem)) = LRUPREV(curitem);
+                        if (ht->head == curitem)
+                            ht->head = LRUNEXT(curitem);
+                        if (ht->tail == curitem) 
+                            ht->tail = LRUPREV(curitem);
+                    }
+                    if (preitem) /* unlink the old item from its hash chain */
+                        preitem->hnext = curitem->hnext;
+                    else
+                        bucket->items[hid] = curitem->hnext;
+                    tmp = curitem->hnext;
+                    ht->size -= sizeof(CDBHTITEM) + curitem->ksize + curitem->vsize
+                        + (ht->lru > 0) * sizeof(CDBHTITEM*) * 2;
+                    ht->num--;
+                    bucket->rnum--;
+                    free(curitem);
+                    curitem = tmp;
+                    break; /* at most one item per key is unlinked here */
+            }
+            preitem = curitem;
+            curitem = curitem->hnext;
+        }
+    }
+
+    item->hnext = bucket->items[hid]; /* new item becomes the first of its chain */
+    bucket->items[hid] = item;
+
+    if (ht->lru) { /* and the head of the LRU list */
+        if (ht->head) LRUPREV(ht->head) = item;
+        LRUPREV(item) = NULL;
+        LRUNEXT(item) = ht->head;
+        ht->head = item;
+        if (ht->tail == NULL)
+            ht->tail = item;
+    }
+
+    bucket->rnum++;
+    ht->num++;
+    ht->size += sizeof(CDBHTITEM) + item->ksize + item->vsize
+        + ht->lru * sizeof(CDBHTITEM*) * 2; /* item header, key, value and, for LRU tables, the two links */
+}
+
+
+void *cdb_ht_insert2(CDBHASHTABLE *ht, const void *key, int ksize, const void *val, int vsize)
+{
+    /* Store copies of key and val as a new item; returns the table's copy of the value. */
+    CDBHTITEM *fresh = cdb_ht_newitem(ht, ksize, vsize);
+    void *valcopy = cdb_ht_itemval(ht, fresh);
+    memcpy(cdb_ht_itemkey(ht, fresh), key, ksize);
+    memcpy(valcopy, val, vsize);
+    cdb_ht_insert(ht, fresh);
+    return valcopy;
+}
+
+void *cdb_ht_get(CDBHASHTABLE *ht, const void *key, int ksize, int *vsize, bool mtf)
+{
+    /* Look key up.  On a hit, store the value length in *vsize and return the
+     * value; on a miss, store 0 and return NULL.  mtf is handed on to
+     * cdb_ht_get3 unchanged. */
+    CDBHTITEM *hit = cdb_ht_get3(ht, key, ksize, mtf);
+    if (hit == NULL) {
+        *vsize = 0;
+        return NULL;
+    }
+    *vsize = hit->vsize;
+    return cdb_ht_itemval(ht, hit);
+}
+
+
+void *cdb_ht_get2(CDBHASHTABLE *ht, const void *key, int ksize, bool mtf)
+{
+    /* Same lookup as cdb_ht_get3, but returns the value pointer (NULL on a
+     * miss) instead of the item. */
+    CDBHTITEM *hit = cdb_ht_get3(ht, key, ksize, mtf);
+
+    if (hit == NULL)
+        return NULL;
+    return cdb_ht_itemval(ht, hit);
+}
+
+
+CDBHTITEM *cdb_ht_get3(CDBHASHTABLE *ht, const void *key, int ksize, bool mtf)
+{ /* Find the item stored under key, or NULL; with mtf on an LRU table the hit is moved to the head of the LRU list. */
+    uint32_t hash, bid, hid;
+    CDBHTBUCKET *bucket;
+    CDBHTITEM *curitem;
+
+    hash = ht->hash(key, ksize);
+    bid = hash & ((1<<CDBHTBNUMPOW)-1); /* same bucket / chain selection as cdb_ht_insert */
+    bucket = &(ht->buckets[bid]);
+    hid = (hash >> CDBHTBNUMPOW) & (bucket->bnum - 1);
+
+    curitem = bucket->items[hid];
+    while (curitem != NULL) {
+        if (curitem->hash == hash
+            && curitem->ksize == ksize
+            && memcmp(cdb_ht_itemkey(ht, curitem), key , ksize) == 0) {
+                if (ht->lru && mtf && ht->head != curitem) { /* nothing to move when the hit is already the head */
+                    if (LRUPREV(curitem))
+                        LRUNEXT(LRUPREV(curitem)) = LRUNEXT(curitem);
+                    if (LRUNEXT(curitem))
+                        LRUPREV(LRUNEXT(curitem)) = LRUPREV(curitem);             
+                    if (ht->tail == curitem) 
+                        ht->tail = LRUPREV(curitem);
+
+                    LRUNEXT(curitem) = ht->head; /* relink in front of the old head */
+                    LRUPREV(ht->head) = curitem;
+                    ht->head = curitem;
+                    LRUPREV(curitem) = NULL;
+                }
+                return curitem;
+        }
+        curitem = curitem->hnext;
+    }
+    return NULL; /* key not present */
+}
+
+
+bool cdb_ht_exist(CDBHASHTABLE *ht, const void *key, int ksize)
+{
+    int ignored; /* the value length is not needed */
+    return cdb_ht_get(ht, key, ksize, &ignored, false) ? true : false; /* mtf off: the LRU order is left alone */
+}
+
+
+int cdb_ht_del2(CDBHASHTABLE *ht, const void *key, int ksize)
+{
+    /* Unlink the item stored under key and free it: 0 if one was found,
+     * -1 otherwise. */
+    CDBHTITEM *gone = cdb_ht_del(ht, key, ksize);
+    if (gone == NULL)
+        return -1;
+    free(gone);
+    return 0;
+}
+
+
+CDBHTITEM *cdb_ht_del(CDBHASHTABLE *ht, const void *key, int ksize)
+{ /* Unlink the item stored under key and return it without freeing it (cdb_ht_del2 frees it); NULL if the key is absent. */
+    uint32_t hash, bid, hid;
+    CDBHTBUCKET *bucket;
+    CDBHTITEM *curitem, *preitem;
+    CDBHTITEM *res = NULL;
+
+    hash = ht->hash(key, ksize);
+    bid = hash & ((1<<CDBHTBNUMPOW)-1); /* same bucket / chain selection as cdb_ht_insert */
+    bucket = &(ht->buckets[bid]);
+    hid = (hash >> CDBHTBNUMPOW) & (bucket->bnum - 1);
+
+    curitem = bucket->items[hid];
+    preitem = NULL;
+    while(curitem != NULL) {
+        if (curitem->hash == hash
+            && curitem->ksize == ksize
+            && memcmp(cdb_ht_itemkey(ht, curitem),
+            key, ksize) == 0) {
+            if (ht->lru) { /* take the item off the LRU list */
+                if (LRUPREV(curitem))
+                    LRUNEXT(LRUPREV(curitem)) = LRUNEXT(curitem);
+                if (LRUNEXT(curitem))
+                    LRUPREV(LRUNEXT(curitem)) = LRUPREV(curitem);
+                if (ht->head == curitem)
+                    ht->head = LRUNEXT(curitem);
+                if (ht->tail == curitem) 
+                    ht->tail = LRUPREV(curitem);
+            }
+            if (preitem) /* take the item off its hash chain */
+                preitem->hnext = curitem->hnext;
+            else
+                bucket->items[hid] = curitem->hnext;
+            ht->size -= sizeof(CDBHTITEM) + curitem->ksize + curitem->vsize
+                + (ht->lru > 0) * sizeof(CDBHTITEM*) * 2; /* mirrors the amount added by cdb_ht_insert */
+            ht->num--;
+            bucket->rnum--;
+            res = curitem;
+            curitem = curitem->hnext; /* has no effect: the loop is left on the next line */
+            break;
+        }
+        preitem = curitem;
+        curitem = curitem->hnext;
+    }
+
+    return res;
+}
+
+
+void cdb_ht_removetail(CDBHASHTABLE *ht)
+{
+    /* Drop the item at the tail of the LRU list, if any.  cdb_ht_poptail
+     * yields NULL for an empty or non-LRU table, and free(NULL) is a no-op,
+     * so no check is needed here. */
+    CDBHTITEM *victim = cdb_ht_poptail(ht);
+
+    free(victim);
+}
+
+
+CDBHTITEM *cdb_ht_gettail(CDBHASHTABLE *ht)
+{ /* Peek at the tail of the LRU list without unlinking it. */
+    return ht->tail; /* NULL when the list is empty; cdb_ht_new starts it out as NULL */
+}
+
+
+CDBHTITEM *cdb_ht_poptail(CDBHASHTABLE *ht)
+{ /* Unlink the tail of the LRU list from the table and return it without freeing it; NULL for a non-LRU or empty table. */
+    CDBHTITEM *item = ht->tail, *curitem, *preitem;;
+    CDBHTBUCKET *bucket;
+    uint32_t bid, hid;
+
+    if (!(ht->lru) || item == NULL)
+        return NULL;
+
+    bid = item->hash & ((1<<CDBHTBNUMPOW)-1); /* the hash stored at insert time locates bucket and chain */
+    bucket = &(ht->buckets[bid]);
+    hid = (item->hash >> CDBHTBNUMPOW) & (bucket->bnum - 1);
+
+    curitem = bucket->items[hid];
+    preitem = NULL;
+    while (curitem != NULL) { /* the tail is located in its chain by comparing hash, key length and key bytes */
+        if (curitem->hash == item->hash
+            && curitem->ksize == item->ksize
+            && memcmp(cdb_ht_itemkey(ht, curitem),
+            cdb_ht_itemkey(ht, item), item->ksize) == 0) {
+                if (preitem) {
+                    preitem->hnext = curitem->hnext;
+                } else {
+                    bucket->items[hid] = curitem->hnext;
+                }
+                break;   
+        }
+        preitem = curitem;
+        curitem = curitem->hnext;
+    }
+
+    if (LRUPREV(item)) /* the predecessor, if any, becomes the new tail */
+        LRUNEXT(LRUPREV(item)) = NULL;
+    if (ht->head == item) /* the popped item was the only one on the list */
+        ht->head = NULL;
+    ht->tail = LRUPREV(item);
+    bucket->rnum--;
+    ht->num--;
+    ht->size -= sizeof(CDBHTITEM) + item->ksize + item->vsize
+        + sizeof(CDBHTITEM*) * 2; /* the two links are always counted: this point is only reached for LRU tables */
+    return item;
+}
+
+/* Free every item in the table while keeping the table itself (and its
+ * bucket slot arrays) allocated for reuse.
+ * Besides emptying the chains this also resets the LRU list ends, which
+ * would otherwise keep pointing at freed items, and gives back the memory
+ * accounted for each item using the same formula as cdb_ht_del(). */
+void cdb_ht_clean(CDBHASHTABLE *ht)
+{
+    for(uint32_t i = 0; i < (1<<CDBHTBNUMPOW); i++) {
+        CDBHTBUCKET *bucket = &(ht->buckets[i]);
+        for(uint32_t j = 0; j < bucket->bnum; j++) {
+            CDBHTITEM *curitem = bucket->items[j];
+            while(curitem != NULL) {
+                CDBHTITEM *tmp = curitem->hnext;
+                /* un-account the item before its sizes become unreadable */
+                ht->size -= sizeof(CDBHTITEM) + curitem->ksize + curitem->vsize
+                    + (ht->lru > 0) * sizeof(CDBHTITEM*) * 2;
+                free(curitem);
+                curitem = tmp;
+            }
+            bucket->items[j] = NULL;
+        }
+        bucket->rnum = 0;
+    }
+    ht->num = 0;
+    /* every item is gone, so the LRU list is empty as well */
+    ht->head = NULL;
+    ht->tail = NULL;
+}
+
+
+/* Release the whole table: all items, every bucket's slot array and the
+ * table structure itself.  In LRU mode items are freed by walking the LRU
+ * list from ht->head; otherwise they are freed by walking the slot chains. */
+void cdb_ht_destroy(CDBHASHTABLE *ht)
+{
+    const bool lrumode = ht->lru;
+
+    if (lrumode) {
+        for (CDBHTITEM *it = ht->head, *following; it != NULL; it = following) {
+            /* read the link before the item is released */
+            following = LRUNEXT(it);
+            free(it);
+        }
+    }
+
+    for (uint32_t b = 0; b < (1<<CDBHTBNUMPOW); b++) {
+        CDBHTBUCKET *bkt = &(ht->buckets[b]);
+
+        if (!lrumode) {
+            for (uint32_t s = 0; s < bkt->bnum; s++) {
+                CDBHTITEM *it = bkt->items[s];
+                while (it != NULL) {
+                    CDBHTITEM *following = it->hnext;
+                    free(it);
+                    it = following;
+                }
+            }
+        }
+        free(bkt->items);
+    }
+    free(ht);
+}
+
+
+/* Start an iteration: return the head item of the first non-empty slot,
+ * scanning buckets and then slots in index order, or NULL when no slot
+ * holds an item. */
+CDBHTITEM *cdb_ht_iterbegin(CDBHASHTABLE *ht)
+{
+    for (uint32_t b = 0; b < (1<<CDBHTBNUMPOW); b++) {
+        CDBHTBUCKET *bkt = &(ht->buckets[b]);
+
+        /* buckets without any item are skipped without scanning their slots */
+        if (bkt->rnum == 0)
+            continue;
+
+        for (uint32_t s = 0; s < bkt->bnum; s++) {
+            CDBHTITEM *first = bkt->items[s];
+            if (first != NULL)
+                return first;
+        }
+    }
+
+    return NULL;
+}
+
+
+/* Continue an iteration: return the item that follows `cur`, or NULL when
+ * `cur` is NULL or was the last item.
+ * Order: rest of cur's slot chain, then later slots of the same bucket,
+ * then the slots of later buckets. */
+CDBHTITEM *cdb_ht_iternext(CDBHASHTABLE *ht, CDBHTITEM *cur)
+{
+    if (cur == NULL)
+        return NULL;
+
+    if (cur->hnext)
+        return cur->hnext;
+
+    /* recompute where cur lives from its stored hash */
+    uint32_t bid = cur->hash & ((1<<CDBHTBNUMPOW)-1);
+    CDBHTBUCKET *bucket = &(ht->buckets[bid]);
+    uint32_t hid = (cur->hash >> CDBHTBNUMPOW) & (bucket->bnum - 1);
+
+    for(uint32_t i = hid + 1; i < bucket->bnum; i++) {
+        if (bucket->items[i])
+            return bucket->items[i];
+    }
+
+    for(uint32_t i = bid + 1; i < (1<<CDBHTBNUMPOW); i++) {
+        /* distinct name: do not shadow the bucket that holds cur */
+        CDBHTBUCKET *nextbucket = &(ht->buckets[i]);
+        if (!nextbucket->rnum)
+            continue;
+        /* unsigned index, matching the type of bnum */
+        for(uint32_t j = 0; j < nextbucket->bnum; j++)
+            if (nextbucket->items[j])
+                return nextbucket->items[j];
+    }
+
+    return NULL;
+}
+
+
+#ifdef _UT_
+#include <stdio.h>
+#include <time.h>
+/* Stand-alone smoke test, built only when _UT_ is defined: fills an LRU
+ * table with 1000 long->long pairs, performs 1000 random lookups, prints
+ * the table totals and pops the two oldest items. */
+int main(int argc, char *argv[])
+{
+    CDBHASHTABLE *ht;
+    long k, v;
+
+    (void)argc;
+    (void)argv;
+
+    ht = cdb_ht_new(true, NULL);
+    for(int i = 0; i < 1000; i++) {
+        k = i;
+        v = i * 1000;
+        cdb_ht_insert2(ht, &k, sizeof(long), &v, sizeof(long));
+    }
+
+    srand(time(NULL));
+
+    for(int i = 0; i < 1000; i++) {
+        long key = rand() % 1000;
+        long *val;
+        int vsize;
+        val = (long*)cdb_ht_get(ht, &key, sizeof(long), &vsize, true);
+        /* never dereference a failed lookup */
+        if (val == NULL) {
+            printf("get: %ld -> (missing)\n", key);
+            continue;
+        }
+        printf("get: %ld -> %ld (%d)\n", key, *val, vsize);
+    }
+
+    /* size and num are uint64_t: print them with a matching conversion */
+    printf("total size: %llu  num: %llu\n",
+           (unsigned long long)ht->size, (unsigned long long)ht->num);
+
+    for(int i = 0; i < 2; i++) {
+        CDBHTITEM *item = cdb_ht_poptail(ht);
+        if (item == NULL)
+            break;
+        printf("tail:  %ld - %ld\n", *(long*)cdb_ht_itemkey(ht, item), *(long*)cdb_ht_itemval(ht, item));
+        free(item);
+    }
+
+    cdb_ht_destroy(ht);
+    return 0;
+}
+#endif
diff --git a/libdap-cuttdb/src/cdb_hashtable.h b/libdap-cuttdb/src/cdb_hashtable.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f35b376dae7dd7618c24500dc729ab71577ad45
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_hashtable.h
@@ -0,0 +1,139 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#ifndef _CDB_HASHTABLE_H_
+#define _CDB_HASHTABLE_H_
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* hash callback: takes a key buffer and its size, returns a 32-bit hash */
+typedef uint32_t (*CDBHASHFUNC)(const void *, int);
+
+/* log2 of the number of level-1 buckets (default 1<<8); the low
+   CDBHTBNUMPOW bits of an item's hash select its bucket */
+#define CDBHTBNUMPOW 8
+
+
+/* One stored element.  The fixed header is followed by a variable-sized
+   payload: in LRU mode two link pointers, then the key bytes, then the
+   value bytes. */
+typedef struct CDBHTITEM
+{
+    /* size of the key in bytes */
+    int ksize;
+    /* size of the value in bytes */
+    int vsize;
+    /* hash of the key, kept so lookups can reject mismatches cheaply */
+    uint32_t hash;
+    /* next element in the same slot chain */
+    struct CDBHTITEM *hnext;
+    /* in LRU mode, the first bytes of the payload hold the prev/next pointers of the LRU list */
+    struct CDBHTITEM *lruptr[0];
+    /* start of the payload; aliases lruptr (both are zero-length arrays) */
+    char buf[0];
+} __attribute__((packed)) CDBHTITEM;
+
+
+/* A level-1 bucket: an array of slots, each heading a chain of items
+   linked through CDBHTITEM.hnext */
+typedef struct {
+    /* array of slot chain heads */
+    CDBHTITEM **items;
+    /* number of allocated slots in the bucket; slot indices are computed
+       with (bnum - 1) as a mask */
+    uint32_t bnum;
+    /* number of items currently stored in the bucket */
+    uint32_t rnum;
+} CDBHTBUCKET;
+
+
+/* The hash table: a fixed array of level-1 buckets plus bookkeeping, and
+   the ends of the LRU list when LRU mode is enabled */
+typedef struct CDBHASHTABLE {
+    /* is the table in LRU mode? */
+    bool lru;
+    /* user specified hash function */
+    CDBHASHFUNC hash;
+    /* fixed number (1<<CDBHTBNUMPOW) of level-1 buckets */
+    CDBHTBUCKET buckets[1<<CDBHTBNUMPOW];
+    /* memory usage in bytes */
+    uint64_t size;
+    /* number of items */
+    uint64_t num;
+    /* in LRU mode, the newest item */
+    CDBHTITEM *head;
+    /* in LRU mode, the oldest item */
+    CDBHTITEM *tail;
+} CDBHASHTABLE;
+
+
+/* get a pointer to the key stored in the item */
+/* #define cdb_ht_itemkey(ht, item) (item->buf + ht->lru * 2 * sizeof(void*)) */
+void *cdb_ht_itemkey(CDBHASHTABLE *ht, CDBHTITEM *item);
+
+/* get a pointer to the value stored in the item */
+/* #define cdb_ht_itemval(ht, item) (item->buf + ht->lru * 2 * sizeof(void*) + item->ksize) */
+void *cdb_ht_itemval(CDBHASHTABLE *ht, CDBHTITEM *item);
+
+/* create a hashtable; it can be a plain hashtable or one with
+   least-recently-used (LRU) tracking.
+   LRU mode needs space for two extra pointers in every element.
+   The hash function can be specified by the user */
+CDBHASHTABLE *cdb_ht_new(bool lru, CDBHASHFUNC hashfunc);
+
+/* free all elements and the hashtable itself */
+void cdb_ht_destroy(CDBHASHTABLE *ht);
+
+/* allocate a new item with the specified sizes, but do not insert it into the table */
+CDBHTITEM *cdb_ht_newitem(CDBHASHTABLE *ht, int ksize, int vsize);
+
+/* insert an already allocated item (e.g. from cdb_ht_newitem) into the table */
+void cdb_ht_insert(CDBHASHTABLE *ht, CDBHTITEM *item);
+
+/* allocate and insert an item into the table by key and value, return a pointer to the value in the table */
+void *cdb_ht_insert2(CDBHASHTABLE *ht, const void *key, int ksize, const void *val, int vsize);
+
+/* get the value of an item and its size, move the item to the front if mtf == true */
+void *cdb_ht_get(CDBHASHTABLE *ht, const void *key, int ksize, int *vsize, bool mtf);
+
+/* get the value of an item, assuming its size is known, move the item to the front if mtf == true */
+void *cdb_ht_get2(CDBHASHTABLE *ht, const void *key, int ksize, bool mtf);
+
+/* get a pointer to the item itself; it is not copied */
+CDBHTITEM *cdb_ht_get3(CDBHASHTABLE *ht, const void *key, int ksize, bool mtf);
+
+/* check whether an item with the key exists */
+bool cdb_ht_exist(CDBHASHTABLE *ht, const void *key, int ksize);
+
+/* delete and free an item from the table by its key;
+   returns 0 if an item was removed, -1 if the key was not found */
+int cdb_ht_del2(CDBHASHTABLE *ht, const void *key, int ksize);
+
+/* remove an item from the table and return it (NULL if not found);
+   the item must be freed by the user */
+CDBHTITEM *cdb_ht_del(CDBHASHTABLE *ht, const void *key, int ksize);
+
+/* delete and free the last (LRU tail) item in the table */
+void cdb_ht_removetail(CDBHASHTABLE *ht);
+
+/* return the last (LRU tail) item in the table, neither deleting nor freeing it */
+CDBHTITEM *cdb_ht_gettail(CDBHASHTABLE *ht);
+
+/* remove the last (LRU tail) item from the table and return it; the item
+   must be freed by the user.  Returns NULL if the table is not in LRU mode
+   or has no tail item */
+CDBHTITEM *cdb_ht_poptail(CDBHASHTABLE *ht);
+
+/* free all elements in the table, keeping the table itself */
+void cdb_ht_clean(CDBHASHTABLE *ht);
+
+/* start iterating over the table by getting its first item (NULL if empty) */
+CDBHTITEM *cdb_ht_iterbegin(CDBHASHTABLE *ht);
+
+/* get the item following the current one (NULL at the end) */
+CDBHTITEM *cdb_ht_iternext(CDBHASHTABLE *ht, CDBHTITEM *cur);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
+
diff --git a/libdap-cuttdb/src/cdb_lock.c b/libdap-cuttdb/src/cdb_lock.c
new file mode 100644
index 0000000000000000000000000000000000000000..54b91071cba0e9ac124c616a54b93fdbe2e29894
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_lock.c
@@ -0,0 +1,75 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#include "cdb_lock.h"
+#include <stdlib.h>
+#include <pthread.h>
+#include <sched.h>
+
+
+/* Allocate and initialise a lock of the requested kind.
+ * ltype: CDB_LOCKSPIN for a pthread spinlock, CDB_LOCKMUTEX for a pthread
+ *        mutex.  The pthread object lives in the trailing lock[] storage.
+ * Returns the new lock, or NULL when ltype is unknown, memory allocation
+ * fails, or the pthread object cannot be initialised.
+ * Release with cdb_lock_destory(). */
+CDBLOCK *cdb_lock_new(int ltype)
+{
+    CDBLOCK *lock = NULL;
+    size_t payload;
+    int rc;
+
+    if (ltype == CDB_LOCKSPIN)
+        payload = sizeof(pthread_spinlock_t);
+    else if (ltype == CDB_LOCKMUTEX)
+        payload = sizeof(pthread_mutex_t);
+    else
+        return NULL;    /* unknown type: nothing to build */
+
+    lock = (CDBLOCK *)malloc(sizeof(CDBLOCK) + payload);
+    if (lock == NULL)
+        return NULL;
+    lock->ltype = ltype;
+
+    if (ltype == CDB_LOCKSPIN)
+        rc = pthread_spin_init((pthread_spinlock_t*)&lock->lock, PTHREAD_PROCESS_PRIVATE);
+    else
+        rc = pthread_mutex_init((pthread_mutex_t*)&lock->lock, NULL);
+
+    /* both init calls return 0 on success */
+    if (rc != 0) {
+        free(lock);
+        return NULL;
+    }
+
+    return lock;
+}
+
+
+/* Acquire the lock using the primitive that matches its type.
+ * Locks of any other type are left untouched. */
+void cdb_lock_lock(CDBLOCK *lock)
+{
+    switch (lock->ltype) {
+    case CDB_LOCKSPIN:
+        pthread_spin_lock((pthread_spinlock_t*)&lock->lock);
+        break;
+    case CDB_LOCKMUTEX:
+        pthread_mutex_lock((pthread_mutex_t*)&lock->lock);
+        break;
+    default:
+        break;
+    }
+}
+
+
+/* Release the lock using the primitive that matches its type.
+ * Locks of any other type are left untouched. */
+void cdb_lock_unlock(CDBLOCK *lock)
+{
+    switch (lock->ltype) {
+    case CDB_LOCKSPIN:
+        pthread_spin_unlock((pthread_spinlock_t*)&lock->lock);
+        break;
+    case CDB_LOCKMUTEX:
+        pthread_mutex_unlock((pthread_mutex_t*)&lock->lock);
+        break;
+    default:
+        break;
+    }
+}
+
+
+/* Destroy the embedded pthread object (if the type is known) and free the
+ * lock structure itself. */
+void cdb_lock_destory(CDBLOCK *lock)
+{
+    switch (lock->ltype) {
+    case CDB_LOCKSPIN:
+        pthread_spin_destroy((pthread_spinlock_t*)&lock->lock);
+        break;
+    case CDB_LOCKMUTEX:
+        pthread_mutex_destroy((pthread_mutex_t*)&lock->lock);
+        break;
+    default:
+        break;
+    }
+
+    free(lock);
+}
+
+
+/* Try to acquire the lock without blocking.
+ * Returns the result of the matching pthread trylock call, or 0 for a lock
+ * of any other type. */
+int cdb_lock_trylock(CDBLOCK *lock)
+{
+    switch (lock->ltype) {
+    case CDB_LOCKSPIN:
+        return pthread_spin_trylock((pthread_spinlock_t*)&lock->lock);
+    case CDB_LOCKMUTEX:
+        return pthread_mutex_trylock((pthread_mutex_t*)&lock->lock);
+    default:
+        return 0;
+    }
+}
+
diff --git a/libdap-cuttdb/src/cdb_lock.h b/libdap-cuttdb/src/cdb_lock.h
new file mode 100644
index 0000000000000000000000000000000000000000..587fcdb18b40722da27f0eebff9fdb0e05934ce3
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_lock.h
@@ -0,0 +1,49 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#ifndef _CDB_LOCK_H_
+#define _CDB_LOCK_H_
+
+
+/* lock kinds accepted by cdb_lock_new() */
+enum {
+    /* spinlock */
+    CDB_LOCKSPIN,
+    /* mutex, which may cause an OS context switch; mainly used where disk I/O happens */
+    CDB_LOCKMUTEX,
+};
+
+/* may be used to indicate whether an area is protected */
+enum {
+    CDB_LOCKED,
+    CDB_NOTLOCKED,
+};
+
+/* A lock handle: the type tag followed by storage for the pthread object */
+typedef struct CDBLOCK
+{
+    /* CDB_LOCKSPIN or CDB_LOCKMUTEX */
+    int ltype;
+    /* trailing storage, cast to pthread_spinlock_t or pthread_mutex_t
+       according to ltype.
+       NOTE(review): this starts right after the int, so it is only
+       int-aligned - confirm that satisfies pthread_mutex_t on all targets */
+    char lock[0];
+} CDBLOCK;
+
+
+/* allocate and initialise a lock; ltype must be CDB_LOCKSPIN or CDB_LOCKMUTEX */
+CDBLOCK *cdb_lock_new(int ltype);
+/* acquire the lock */
+void cdb_lock_lock(CDBLOCK *lock);
+/* release the lock */
+void cdb_lock_unlock(CDBLOCK *lock);
+/* destroy the lock and free its memory */
+void cdb_lock_destory(CDBLOCK *lock);
+/* try to acquire the lock; returns the result of the underlying pthread trylock call */
+int cdb_lock_trylock(CDBLOCK *lock);
+
+
+
+#endif
+
diff --git a/libdap-cuttdb/src/cdb_types.h b/libdap-cuttdb/src/cdb_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfb6e6b8c7b7be2940d25e4ff9f8c098d3bc48c4
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_types.h
@@ -0,0 +1,144 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#ifndef _CDB_TYPES_H_
+#define _CDB_TYPES_H_
+#include <stdint.h>
+
+/* byte multipliers */
+#define KB 1024
+#define MB 1048576
+/* min/max helpers; NOTE: each argument is evaluated twice, so avoid side effects */
+#define CDBMIN(a, b) ((a)<(b)?(a):(b))
+#define CDBMAX(a, b) ((a)>(b)?(a):(b))
+
+/* byte widths used when computing on-disk header sizes (see RECHSIZE, PAGEHSIZE) */
+#define SI8 8
+#define SI4 4
+/* space reserved on the stack for I/O, avoids some malloc/free */
+#define SBUFSIZE (64 * KB)
+
+/* default disk read size for an index page; 3KB is enough (a page with 300 items) */
+#define PAGEAREADSIZE (3 * KB)
+
+/* reserved on the stack for matched items in a hash index page */
+#define SFOFFNUM 8
+
+/* a valid virtual offset */
+#define OFFNOTNULL(o) (((o).i4)||((o).i2))
+/* a null virtual offset */
+#define OFFNULL(o) (((o).i4==0)&&((o).i2==0))
+/* nullify an offset  */
+#define OFFZERO(o) do{(o).i4=0;(o).i2=0;}while(0)
+/* are two offsets equal? */
+#define OFFEQ(a,b) (((a).i4==(b).i4)&&((a).i2==(b).i2))
+/* are two in-page hashes equal? */
+#define PHASHEQ(a,b) (((a).i2==(b).i2)&&((a).i1==(b).i1))
+/* page size increment */
+#define CDB_PAGEINCR 4
+
+
+/* true if the page cache size exceeds the limit.
+   NOTE(review): only dpcache is checked for NULL while pcache is also
+   dereferenced - presumably both are created together; confirm */
+#define PCOVERFLOW(db) ((db)->dpcache && (db)->dpcache->size + (db)->pcache->size > (db)->pclimit)
+/* true if the record cache exists and its size exceeds the limit */
+#define RCOVERFLOW(db) ((db)->rcache && (db)->rcache->size > (db)->rclimit)
+
+/* timeout for a dirty index page, counted since its last modification */
+#define DPAGETIMEOUT 40
+/* operations on the main table are isolated by this many locks */
+#define MLOCKNUM 256
+
+/* 64-bit hash used by the database: forwards to cdb_crc64 */
+#define CDBHASH64(a, b) cdb_crc64(a, b) 
+
+/* all virtual offsets are 48 bits wide: a 32-bit part plus a 16-bit part,
+   packed so the structure has no padding */
+typedef struct FOFF
+{
+    uint32_t i4;
+    uint16_t i2;
+} __attribute__((packed)) FOFF;
+
+
+
+/* size in bytes of a virtual offset */
+#define SFOFF (sizeof(FOFF))
+
+
+/* all hash values in an index page are 24 bits wide (a 16-bit part plus an
+    8-bit part); the range 0..16M keeps collisions very unlikely
+    with fewer than a hundred records in a page */
+typedef struct PHASH
+{
+    uint16_t i2;
+    uint8_t i1;
+} __attribute__((packed)) PHASH;
+
+
+/* an item in an index page: the virtual offset of the entry together with
+   its in-page hash, packed without padding */
+typedef struct PITEM
+{
+    FOFF off;
+    PHASH hash;
+} __attribute__((packed)) PITEM;
+
+
+/* data record: in-memory bookkeeping fields followed by the fields that are
+   stored on disk */
+typedef struct CDBREC{
+    /* where the data came from (original offset and size) */
+    FOFF ooff;
+    uint32_t osize;
+
+    /* convenience pointers to the key and the value */
+    void *key;
+    void *val;
+
+    /* the on-disk representation starts at the following field */
+    uint32_t magic;
+    uint32_t ksize;
+    uint32_t vsize;
+    uint32_t expire;
+    uint64_t oid;
+    char buf[0];
+} __attribute__((packed)) CDBREC;
+
+/* real size of a record header when stored on disk: four 4-byte fields
+   (magic, ksize, vsize, expire) plus the 8-byte oid */
+#define RECHSIZE (SI4 * 4 + SI8)
+/* real size of a record when stored on disk: header plus key and value bytes */
+#define RECSIZE(r) (RECHSIZE + (r)->ksize + (r)->vsize)
+
+
+/* index page: in-memory bookkeeping fields, a small header and a trailing
+   array of items */
+typedef struct CDBPAGE{
+    /* original offset and size (same naming as in CDBREC) */
+    FOFF ooff;
+    uint32_t osize;
+    /* number of PITEM slots allocated in items[] (see MPAGESIZE) */
+    uint32_t cap;
+
+    union {
+        /* what it is on disk */
+        uint32_t magic;
+        /* what it is in memory */
+        uint32_t mtime;
+    };
+    /* which bucket it belongs to */
+    uint32_t bid;
+    /* number of items in use (see PAGESIZE) */
+    uint32_t num;
+    uint64_t oid;
+    PITEM items[0];
+} __attribute__((packed)) CDBPAGE;
+
+/* real size of a page header when stored on disk: three 4-byte fields plus
+   one 8-byte field - presumably magic, bid, num and oid; confirm against the
+   storage code */
+#define PAGEHSIZE (SI4 * 3 + SI8)
+/* real size of a page when stored on disk: header plus the items in use */
+#define PAGESIZE(p) (PAGEHSIZE + sizeof(PITEM) * (p)->num)
+/* in-memory size of a page structure, counting all allocated item slots */
+#define MPAGESIZE(p) (sizeof(CDBPAGE) + sizeof(PITEM) * (p)->cap)
+
+#endif
+
diff --git a/libdap-cuttdb/src/cdb_vio.c b/libdap-cuttdb/src/cdb_vio.c
new file mode 100644
index 0000000000000000000000000000000000000000..c0da6d1572812cbda4afe795cf24e25a733e375b
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_vio.c
@@ -0,0 +1,42 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#include "cdb_vio.h"
+#include "cdb_types.h"
+#include "vio_apnd2.h"
+#include "stdlib.h"
+
+
+/* Allocate a CDBVIO and initialise it through the backend's init routine.
+ * type: one of the CDBVIO* storage kinds; every value, including unknown
+ *       ones, currently selects the append-only (apnd2) backend.
+ * Returns the new object, or NULL if memory allocation fails.
+ * Release with cdb_vio_destroy(). */
+CDBVIO *cdb_vio_new(int type)
+{
+    CDBVIO *res;
+    res = (CDBVIO *)malloc(sizeof(CDBVIO));
+    /* never hand a NULL object to the backend initialiser */
+    if (res == NULL)
+        return NULL;
+    switch(type) {
+        case CDBVIOAPND2:
+        default:
+            vio_apnd2_init(res);
+            break;
+    }
+    return res;
+}
+
+/* Free the CDBVIO structure itself; nothing the structure points to (db,
+ * iometa) is released here.  Always returns 0. */
+int cdb_vio_destroy(CDBVIO *vio)
+{
+    free(vio);
+    return 0;
+}
+
diff --git a/libdap-cuttdb/src/cdb_vio.h b/libdap-cuttdb/src/cdb_vio.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c6e7e205813f11d84d45f0fa8916cff1edc3bbe
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_vio.h
@@ -0,0 +1,101 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#ifndef _CDB_VIO_H_
+#define _CDB_VIO_H_
+#include "cdb_types.h"
+#include "cuttdb.h"
+#include <stdint.h>
+#include <stdbool.h>
+
+/* storage format selectors passed to cdb_vio_new() */
+enum {
+    /* obsolete */
+    CDBVIOAPPEND,
+    /* append-only format storage */
+    CDBVIOAPND2,
+};
+
+typedef struct CDBVIO CDBVIO;
+
+/* write a record; the virtual offset is returned through the 3rd parameter */
+typedef int (*VIOWRITEREC)(CDBVIO*, CDBREC*, FOFF*);
+/* delete a record; the current offset is passed in as the 3rd parameter */
+typedef int (*VIODELETEREC)(CDBVIO*, CDBREC*, FOFF);
+/* read a record; the 2nd parameter points to a stack buffer by default and,
+if the real size is greater than the stack buffer size, is changed to point
+to space on the heap; the last parameter decides whether to read the whole
+record or just the key for comparison */
+typedef int (*VIOREADREC)(CDBVIO*, CDBREC**, FOFF, bool);
+/* close the storage */
+typedef int (*VIOCLOSE)(CDBVIO*);
+/* open the storage; takes the storage path and the open mode */
+typedef int (*VIOOPEN)(CDBVIO*, const char*, int);
+/* write an index page; its virtual offset is returned through the 3rd parameter */
+typedef int (*VIOWRITEPAGE)(CDBVIO*, CDBPAGE *, FOFF*);
+/* read an index page; the 2nd parameter points to a stack buffer by default
+and, if the real size is greater than the stack buffer size, is changed to
+point to space on the heap */
+typedef int (*VIOREADPAGE)(CDBVIO*, CDBPAGE **, FOFF);
+/* make the storage perform a sync operation */
+typedef int (*VIOSYNC)(CDBVIO*);
+/* write the db header, which contains the main index */
+typedef int (*VIOWRITEHEAD)(CDBVIO*);
+/* read the db header, which contains the main index */
+typedef int (*VIOREADHEAD)(CDBVIO*);
+/* tell the storage that no dirty page exists */
+typedef void (*VIOCLEANPOINT)(CDBVIO*);
+/* get the record/page iterator positioned at oid */
+typedef void* (*VIOITFIRST)(CDBVIO *, uint64_t oid);
+/* get the next index page from an iterator */
+typedef int (*VIOPAGEITNEXT)(CDBVIO *, CDBPAGE **, void *);
+/* get the next record from an iterator */
+typedef int (*VIORECITNEXT)(CDBVIO *, CDBREC **, void *);
+/* destroy and free an iterator */
+typedef void (*VIOITDESTROY)(CDBVIO *, void *);
+
+/* Storage backend interface: a table of callbacks plus the backend's
+   private state */
+struct CDBVIO 
+{
+    /* open / close the storage */
+    VIOOPEN open;
+    VIOCLOSE close;
+
+    /* write / delete / read a record */
+    VIOWRITEREC wrec;
+    VIODELETEREC drec;
+    VIOREADREC rrec;
+
+    /* write / read an index page */
+    VIOWRITEPAGE wpage;
+    VIOREADPAGE rpage;
+
+    /* sync the storage; write / read the db header */
+    VIOSYNC sync;
+    VIOWRITEHEAD whead;
+    VIOREADHEAD rhead;
+    
+    /* notify the storage that no dirty page exists */
+    VIOCLEANPOINT cleanpoint;
+
+    /* index page iteration: start, next, destroy */
+    VIOITFIRST pageitfirst;
+    VIOPAGEITNEXT pageitnext;
+    VIOITDESTROY pageitdestroy;
+
+    /* record iteration: start, next, destroy */
+    VIOITFIRST recitfirst;
+    VIORECITNEXT recitnext;
+    VIOITDESTROY recitdestroy;
+
+    /* presumably the database this storage belongs to - TODO confirm */
+    CDB *db;
+    /* opaque pointer, presumably backend-private metadata - TODO confirm */
+    void *iometa;
+};
+
+
+/* allocate a storage interface object of the given type (one of the CDBVIO* constants) */
+CDBVIO *cdb_vio_new(int type);
+/* free a storage interface object; always returns 0 */
+int cdb_vio_destroy(CDBVIO *vio);
+
+
+#endif
diff --git a/libdap-cuttdb/src/cuttdb-server.c b/libdap-cuttdb/src/cuttdb-server.c
new file mode 100644
index 0000000000000000000000000000000000000000..9b09a2863a28eeb88b74eebd1f23ebf6f6bfaa71
--- /dev/null
+++ b/libdap-cuttdb/src/cuttdb-server.c
@@ -0,0 +1,2152 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   The server&network part of CuttDB is based on Beansdb:
+ *
+ *   http://beansdb.googlecode.com
+ *
+ *   Beansdb is most based on Memcachedb and Memcached:
+ *
+ *   http://memcachedb.org/
+ *   http://danga.com/memcached/
+ *
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#include "cuttdb-server.h"
+#include "cuttdb.h"
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <signal.h>
+#include <sys/resource.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+/* need this to get IOV_MAX on some platforms. */
+#ifndef __need_IOV_MAX
+#define __need_IOV_MAX
+#endif
+#include <pwd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <assert.h>
+#include <limits.h>
+#include <inttypes.h>
+#include <ctype.h>
+
+
+#ifdef HAVE_READPROC
+#include <proc/readproc.h>
+#endif
+
+#ifdef HAVE_MALLOC_H
+/* OpenBSD has a malloc.h, but warns to use stdlib.h instead */
+#ifndef __OpenBSD__
+#include <malloc.h>
+#endif
+#endif
+
+/* FreeBSD 4.x doesn't have IOV_MAX exposed. */
+#ifndef IOV_MAX
+#if defined(__FreeBSD__) || defined(__APPLE__)
+# define IOV_MAX 1024
+#endif
+#endif
+
+#ifndef IOV_MAX
+# define IOV_MAX 1024
+#endif
+
+#ifndef CLOCK_MONOTONIC
+#include "clock_gettime_stub.c"
+#endif
+
+/*
+ * forward declarations
+ */
+static int new_socket(struct addrinfo *ai);
+static int server_socket(const int port, const bool is_udp);
+static int try_read_command(conn *c);
+static int try_read_network(conn *c);
+
+/* stats */
+static void stats_reset(void);
+static void stats_init(void);
+
+/* defaults */
+static void settings_init(void);
+
+/* event handling, network IO */
+static void conn_close(conn *c);
+static void conn_init(void);
+static bool update_event(conn *c, const int new_flags);
+int delete_event(int fd);
+static void complete_nread(conn *c);
+static void process_command(conn *c, char *command);
+static int transmit(conn *c);
+static int ensure_iov_space(conn *c);
+static int add_iov(conn *c, const void *buf, int len);
+static int add_msghdr(conn *c);
+static void conn_free(conn *c);
+
+
+static size_t item_make_header(const uint8_t nkey, const int flags, const int nbytes,
+                     char *suffix, uint8_t *nsuffix);
+static int item_free(item *it);
+static item *item_get(char *key, size_t nkey);
+static item *item_alloc1(char *key, const size_t nkey, const int flags, const int nbytes);
+
+/** exported globals **/
+struct stats stats;
+struct settings settings;
+
+CDB *db = NULL;
+FILE   *access_log = NULL;
+int daemon_quit = 0;
+
+/** file scope variables **/
+static int stub_fd = 0;
+
+#define TRANSMIT_COMPLETE   0
+#define TRANSMIT_INCOMPLETE 1
+#define TRANSMIT_SOFT_ERROR 2
+#define TRANSMIT_HARD_ERROR 3
+
+
+/* Item subsystem initialisation hook.  Currently a no-op: the item
+ * free-list set-up it once performed is disabled (kept below for
+ * reference). */
+void item_init(void) {
+    /*freeitemtotal = INIT_ITEM_FREELIST_LENGTH;
+    freeitemcurr  = 0;
+
+    freeitem = (item **)malloc( sizeof(item *) * freeitemtotal );
+    if (freeitem == NULL) {
+        perror("malloc()");
+    }*/
+    return;
+}
+
+/*
+ * Formats the item suffix (" <flags> <nbytes - 2>\r\n") into `suffix`,
+ * stores its length in *nsuffix and returns the total number of bytes
+ * needed for an item with that suffix, a key area of `nkey` bytes and
+ * `nbytes` bytes of data.
+ */
+static size_t item_make_header(const uint8_t nkey, const int flags, const int nbytes,
+                     char *suffix, uint8_t *nsuffix) {
+    /* callers provide a 40 byte suffix buffer */
+    const int written = snprintf(suffix, 40, " %d %d\r\n", flags, nbytes - 2);
+
+    *nsuffix = (uint8_t) written;
+    return sizeof(item) + nkey + *nsuffix + nbytes;
+}
+
+/* Releases an item with free(). Always returns 0. */
+static int item_free(item *it)
+{
+    free(it);
+    return 0;
+}
+
+/*
+ * Looks `key` up in the database and wraps the value in a newly allocated
+ * item whose data is the value followed by "\r\n".
+ *
+ * Returns NULL when cdb_get() reports anything but 0 or when the item
+ * cannot be allocated.
+ */
+static item *item_get(char *key, size_t nkey)
+{
+    void *payload;
+    int paylen;
+    item *found;
+
+    if (cdb_get(db, key, nkey, &payload, &paylen) != 0)
+        return NULL;
+
+    /* flags are always 0; two extra bytes hold the trailing "\r\n" */
+    found = item_alloc1(key, nkey, 0, paylen + 2);
+    if (found != NULL) {
+        memcpy(ITEM_data(found), payload, paylen);
+        memcpy(ITEM_data(found) + paylen, "\r\n", 2);
+    }
+
+    /* the value buffer is handed back to the database in either case */
+    cdb_free_val(&payload);
+    return found;
+}
+
+/*
+ * Allocates a zero-filled item sized for `key` (nkey bytes plus a
+ * terminating NUL), the formatted suffix and `nbytes` bytes of data, then
+ * fills in the key, suffix and length fields.
+ *
+ * Returns NULL when the key is too long for the uint8_t key length taken by
+ * item_make_header(), or when memory allocation fails.
+ */
+static item *item_alloc1(char *key, const size_t nkey, const int flags, const int nbytes)
+{
+    uint8_t nsuffix;
+    item *it;
+    char suffix[40];
+    size_t ntotal;
+
+    /* nkey + 1 would be truncated to uint8_t below, which would under-size
+       the allocation and let the key copy run past it */
+    if (nkey >= UINT8_MAX) {
+        return NULL;
+    }
+    ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
+
+    it = (item *)malloc(ntotal);
+    if (it == NULL){
+        return NULL;
+    }
+    memset(it, 0, ntotal);
+
+    it->nkey = nkey;
+    it->nbytes = nbytes;
+    /* copy exactly nkey bytes; the terminating NUL comes from the memset */
+    memcpy(ITEM_key(it), key, nkey);
+    memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
+    it->nsuffix = nsuffix;
+    return it;
+}
+
+
+/* Zeroes every statistics counter and records the start time. */
+static void stats_init(void) {
+    /* connection counters */
+    stats.conn_structs = 0;
+    stats.total_conns = 0;
+    stats.curr_conns = 0;
+
+    /* command counters */
+    stats.delete_cmds = 0;
+    stats.set_cmds = 0;
+    stats.get_cmds = 0;
+    stats.get_misses = 0;
+    stats.get_hits = 0;
+    stats.slow_cmds = 0;
+
+    /* traffic counters */
+    stats.bytes_written = 0;
+    stats.bytes_read = 0;
+
+    /* the start time is always set 2 seconds before the real one, so
+       time(0) - stats.started is never zero; otherwise things like
+       'settings.oldest_live', which act as booleans as well as values,
+       would be false in boolean context */
+    stats.started = time(0) - 2;
+}
+
+/* Clears the resettable counters under the stats lock; curr_conns,
+ * conn_structs and the start time are left untouched. */
+static void stats_reset(void) {
+    STATS_LOCK();
+
+    stats.total_conns = 0;
+
+    stats.delete_cmds = 0;
+    stats.set_cmds = 0;
+    stats.get_cmds = 0;
+    stats.get_misses = 0;
+    stats.get_hits = 0;
+    stats.slow_cmds = 0;
+
+    stats.bytes_written = 0;
+    stats.bytes_read = 0;
+
+    STATS_UNLOCK();
+}
+
+/* Loads the built-in defaults into the global settings. */
+static void settings_init(void) {
+    /* network */
+    settings.port = 8964;
+    settings.inter = NULL;              /* must default to NULL for getaddrinfo() */
+    settings.maxconns = 1024;           /* limits connection-related memory to about 5MB */
+
+    /* buffers and threads */
+    settings.item_buf_size = 4 * 1024;  /* 4KB */
+    settings.num_threads = 16;
+
+    /* logging and timing */
+    settings.verbose = 0;
+    settings.flush_period = 1;          /* seconds */
+    settings.slow_cmd_time = 0.1;       /* 100ms */
+}
+
+/*
+ * Adds a message header to a connection.
+ *
+ * Returns 0 on success, -1 on out-of-memory.
+ */
+static int add_msghdr(conn *c)
+{
+    struct msghdr *slot;
+
+    assert(c != NULL);
+
+    /* list full: double its capacity, leaving it untouched on failure */
+    if (c->msgused == c->msgsize) {
+        struct msghdr *grown = realloc(c->msglist, c->msgsize * 2 * sizeof(struct msghdr));
+        if (grown == NULL)
+            return -1;
+        c->msglist = grown;
+        c->msgsize *= 2;
+    }
+
+    slot = &c->msglist[c->msgused];
+
+    /* zeroing wipes msg_iovlen, msg_control, msg_controllen and msg_flags,
+       the last 3 of which aren't defined on solaris */
+    memset(slot, 0, sizeof(struct msghdr));
+
+    /* the new message starts at the first unused iovec */
+    slot->msg_iov = &c->iov[c->iovused];
+
+    c->msgbytes = 0;
+    c->msgused++;
+
+    return 0;
+}
+
+
+/*
+ * Free list management for connections.
+ */
+
+static conn **freeconns;
+static int freetotal;
+static int freecurr;
+
+
+/* Creates the connection free list with room for 200 entries; an
+ * allocation failure is only reported on stderr. */
+static void conn_init(void) {
+    freecurr = 0;
+    freetotal = 200;
+
+    freeconns = (conn **)malloc(sizeof(conn *) * freetotal);
+    if (freeconns == NULL)
+        fprintf(stderr, "malloc()\n");
+}
+
+/*
+ * Returns a connection from the freelist, if any. Should call this using
+ * conn_from_freelist() for thread safety.
+ */
+conn *do_conn_from_freelist() {
+    /* pop the most recently added entry, or report an empty list */
+    return (freecurr > 0) ? freeconns[--freecurr] : NULL;
+}
+
+/*
+ * Adds a connection to the freelist. 0 = success. Should call this using
+ * conn_add_to_freelist() for thread safety.
+ */
+bool do_conn_add_to_freelist(conn *c) {
+    if (freecurr >= freetotal) {
+        /* list is full: try to double the array; the caller keeps
+           ownership of c when that fails */
+        conn **grown = realloc(freeconns, sizeof(conn *) * freetotal * 2);
+        if (grown == NULL)
+            return true;
+        freetotal *= 2;
+        freeconns = grown;
+    }
+
+    freeconns[freecurr++] = c;
+    return false;
+}
+
+/*
+ * Sets up a connection object for socket `sfd`.
+ *
+ * A connection is taken from the free list when one is available;
+ * otherwise a new one is allocated together with its read/write buffers,
+ * item list, iovec array and message list. The per-request state is then
+ * reset and the socket is registered for read events.
+ *
+ * sfd:              socket descriptor the connection wraps
+ * init_state:       initial state of the connection (e.g. conn_listening)
+ * read_buffer_size: size of the read buffer of a newly allocated
+ *                   connection; a recycled connection keeps its buffers
+ *
+ * Returns the connection, or NULL when allocation or event registration
+ * fails.
+ */
+conn *conn_new(const int sfd, const int init_state, const int read_buffer_size) {
+    conn *c = conn_from_freelist();
+
+    if (NULL == c) {
+        /* nothing to recycle: build a connection from scratch */
+        if (!(c = (conn *)calloc(1, sizeof(conn)))) {
+            fprintf(stderr, "calloc()\n");
+            return NULL;
+        }
+        c->rbuf = c->wbuf = 0;
+        c->ilist = 0;
+        c->iov = 0;
+        c->msglist = 0;
+
+        c->rsize = read_buffer_size;
+        c->wsize = DATA_BUFFER_SIZE;
+        c->isize = ITEM_LIST_INITIAL;
+        c->iovsize = IOV_LIST_INITIAL;
+        c->msgsize = MSG_LIST_INITIAL;
+
+        c->rbuf = (char *)malloc((size_t)c->rsize);
+        c->wbuf = (char *)malloc((size_t)c->wsize);
+        c->ilist = (item **)malloc(sizeof(item *) * c->isize);
+        c->iov = (struct iovec *)malloc(sizeof(struct iovec) * c->iovsize);
+        c->msglist = (struct msghdr *)malloc(sizeof(struct msghdr) * c->msgsize);
+
+        /* if any buffer is missing, conn_free() releases the ones that were allocated */
+        if (c->rbuf == 0 || c->wbuf == 0 || c->ilist == 0 || c->iov == 0 ||
+                c->msglist == 0) {
+            conn_free(c);
+            fprintf(stderr, "malloc()\n");
+            return NULL;
+        }
+
+        STATS_LOCK();
+        stats.conn_structs++;
+        STATS_UNLOCK();
+    }
+
+    if (settings.verbose > 1) {
+        if (init_state == conn_listening)
+            fprintf(stderr, "<%d server listening\n", sfd);
+        else
+            fprintf(stderr, "<%d new client connection\n", sfd);
+    }
+
+    /* reset the per-request state; this also applies to recycled connections */
+    c->sfd = sfd;
+    c->state = init_state;
+    c->rlbytes = 0;
+    c->rbytes = c->wbytes = 0;
+    c->wcurr = c->wbuf;
+    c->rcurr = c->rbuf;
+    c->ritem = 0;
+    c->icurr = c->ilist;
+    c->ileft = 0;
+    c->iovused = 0;
+    c->msgcurr = 0;
+    c->msgused = 0;
+
+    c->write_and_go = conn_read;
+    c->write_and_free = 0;
+    c->item = 0;
+    c->noreply = false;
+    
+    update_event(c, AE_READABLE);
+    /* on registration failure the connection goes back to the free list,
+       or is freed when the free list cannot take it */
+    if (add_event(sfd, AE_READABLE, c) == -1) {
+        if (conn_add_to_freelist(c)) {
+            conn_free(c);
+        }
+        perror("event_add");
+        return NULL;
+    }
+
+    STATS_LOCK();
+    stats.curr_conns++;
+    stats.total_conns++;
+    STATS_UNLOCK();
+
+    return c;
+}
+
+/* Releases the per-request resources still attached to a connection: the
+ * current item, the items left in the item list and the pending
+ * write-and-free buffer. */
+static void conn_cleanup(conn *c) {
+    assert(c != NULL);
+
+    if (c->item) {
+        item_free(c->item);
+        c->item = 0;
+    }
+
+    /* free the remaining items, advancing the cursor as we go */
+    while (c->ileft > 0) {
+        item_free(*(c->icurr));
+        c->ileft--;
+        c->icurr++;
+    }
+
+    if (c->write_and_free) {
+        free(c->write_and_free);
+        c->write_and_free = 0;
+    }
+}
+
+/*
+ * Frees a connection.
+ */
+void conn_free(conn *c) {
+    if (c == NULL)
+        return;
+
+    /* free(NULL) is a no-op, so buffers that were never allocated are fine */
+    free(c->msglist);
+    free(c->rbuf);
+    free(c->wbuf);
+    free(c->ilist);
+    free(c->iov);
+    free(c);
+}
+
+/*
+ * Closes a connection: unregisters and closes its socket, releases the
+ * per-request resources, then either frees the structure or parks it on
+ * the free list, and finally decrements the current-connection counter.
+ */
+static void conn_close(conn *c) {
+    int must_free = 1;
+
+    assert(c != NULL);
+
+    if (settings.verbose > 1) {
+        fprintf(stderr, "<%d connection closed.\n", c->sfd);
+    }
+
+    delete_event(c->sfd);
+    close(c->sfd);
+    c->sfd = -1;
+    update_event(c, 0);
+    conn_cleanup(c);
+
+    /*
+     * Connections with big read buffers are always freed.  The others are
+     * offered to the free list and freed only when that call reports a
+     * non-zero result (presumably "not accepted" -- confirm against
+     * conn_add_to_freelist).
+     */
+    if (c->rsize <= READ_BUFFER_HIGHWAT) {
+        must_free = conn_add_to_freelist(c) ? 1 : 0;
+    }
+    if (must_free) {
+        conn_free(c);
+    }
+
+    STATS_LOCK();
+    stats.curr_conns--;
+    STATS_UNLOCK();
+}
+
+
+/*
+ * Trims a connection's buffers back to their initial sizes once they have
+ * grown past their high-water marks, so that an occasional large "get"
+ * does not pin memory for the lifetime of the connection.
+ *
+ * Call this only between requests: it may discard output buffers.
+ * Every reallocation here is best-effort; when realloc() fails the
+ * existing, larger buffer simply stays in place.
+ */
+static void conn_shrink(conn *c) {
+    assert(c != NULL);
+
+    if (c->rsize > READ_BUFFER_HIGHWAT && c->rbytes < DATA_BUFFER_SIZE) {
+        char *smaller_rbuf;
+
+        /* Slide unread bytes to the start so they survive the shrink. */
+        if (c->rcurr != c->rbuf) {
+            memmove(c->rbuf, c->rcurr, (size_t)c->rbytes);
+        }
+
+        smaller_rbuf = (char *)realloc((void *)c->rbuf, DATA_BUFFER_SIZE);
+        if (smaller_rbuf != NULL) {
+            c->rbuf = smaller_rbuf;
+            c->rsize = DATA_BUFFER_SIZE;
+        }
+        /* TODO check other branch... */
+        c->rcurr = c->rbuf;
+    }
+
+    if (c->isize > ITEM_LIST_HIGHWAT) {
+        item **smaller_ilist = (item **)realloc((void *)c->ilist,
+                                ITEM_LIST_INITIAL * sizeof(c->ilist[0]));
+        /* TODO check error condition? */
+        if (smaller_ilist != NULL) {
+            c->ilist = smaller_ilist;
+            c->isize = ITEM_LIST_INITIAL;
+        }
+    }
+
+    if (c->msgsize > MSG_LIST_HIGHWAT) {
+        struct msghdr *smaller_msglist = (struct msghdr *)realloc((void *)c->msglist,
+                                MSG_LIST_INITIAL * sizeof(c->msglist[0]));
+        /* TODO check error condition? */
+        if (smaller_msglist != NULL) {
+            c->msglist = smaller_msglist;
+            c->msgsize = MSG_LIST_INITIAL;
+        }
+    }
+
+    if (c->iovsize > IOV_LIST_HIGHWAT) {
+        struct iovec *smaller_iov = (struct iovec *)realloc((void *)c->iov,
+                                IOV_LIST_INITIAL * sizeof(c->iov[0]));
+        /* TODO check return value */
+        if (smaller_iov != NULL) {
+            c->iov = smaller_iov;
+            c->iovsize = IOV_LIST_INITIAL;
+        }
+    }
+}
+
+/*
+ * Moves a connection to a new state of the state machine.  Work that must
+ * accompany a particular transition lives here; currently the only such
+ * work is shrinking the buffers when the connection enters conn_read.
+ * Setting the state it already has is a no-op.
+ */
+static void conn_set_state(conn *c, int state) {
+    assert(c != NULL);
+
+    if (state == c->state)
+        return;
+
+    if (state == conn_read)
+        conn_shrink(c);
+
+    c->state = state;
+}
+
+
+/*
+ * Makes sure the connection's iov array can take one more struct iovec,
+ * doubling the array when it is full.
+ *
+ * Returns 0 on success, -1 on out-of-memory.
+ */
+static int ensure_iov_space(conn *c) {
+    struct iovec *grown;
+    int msg_idx;
+    int iov_offset = 0;
+
+    assert(c != NULL);
+
+    if (c->iovused < c->iovsize)
+        return 0;
+
+    grown = (struct iovec *)realloc(c->iov,
+                                    (c->iovsize * 2) * sizeof(struct iovec));
+    if (grown == NULL)
+        return -1;
+
+    c->iov = grown;
+    c->iovsize *= 2;
+
+    /* The array may have moved: re-aim every msghdr at its slice of it. */
+    for (msg_idx = 0; msg_idx < c->msgused; msg_idx++) {
+        c->msglist[msg_idx].msg_iov = &c->iov[iov_offset];
+        iov_offset += c->msglist[msg_idx].msg_iovlen;
+    }
+
+    return 0;
+}
+
+
+/*
+ * Adds data to the list of pending data that will be written out to a
+ * connection.  The buffer is referenced, not copied, so it must stay valid
+ * until the data has been transmitted.
+ *
+ * While only the first msghdr is in use, the payload is capped at
+ * MAX_PAYLOAD_SIZE bytes; data beyond the cap is split off and placed in
+ * a following msghdr on the next loop iteration.
+ *
+ * Returns 0 on success, -1 on out-of-memory (either the iov array or a
+ * new msghdr could not be allocated).
+ */
+static int add_iov(conn *c, const void *buf, int len) {
+    struct msghdr *m;
+    int leftover;
+    bool limit_to_mtu;
+
+    assert(c != NULL);
+
+    do {
+        m = &c->msglist[c->msgused - 1];
+
+        /*
+         * Limit the first payloads of TCP replies, to
+         * MAX_PAYLOAD_SIZE bytes.
+         */
+        limit_to_mtu = (1 == c->msgused);
+
+        /* We may need to start a new msghdr if this one is full. */
+        if (m->msg_iovlen == IOV_MAX ||
+            (limit_to_mtu && c->msgbytes >= MAX_PAYLOAD_SIZE)) {
+            /*
+             * Without a fresh header the entry below would be appended to
+             * the one that is already full, so a failure must be reported.
+             */
+            if (add_msghdr(c) != 0)
+                return -1;
+            m = &c->msglist[c->msgused - 1];
+        }
+
+        if (ensure_iov_space(c) != 0)
+            return -1;
+
+        /* If the fragment is too big to fit in the datagram, split it up */
+        if (limit_to_mtu && len + c->msgbytes > MAX_PAYLOAD_SIZE) {
+            leftover = len + c->msgbytes - MAX_PAYLOAD_SIZE;
+            len -= leftover;
+        } else {
+            leftover = 0;
+        }
+
+        /* Re-read the header: the calls above may have reallocated lists. */
+        m = &c->msglist[c->msgused - 1];
+        m->msg_iov[m->msg_iovlen].iov_base = (void *)buf;
+        m->msg_iov[m->msg_iovlen].iov_len = len;
+
+        c->msgbytes += len;
+        c->iovused++;
+        m->msg_iovlen++;
+
+        buf = ((char *)buf) + len;
+        len = leftover;
+    } while (leftover > 0);
+
+    return 0;
+}
+
+
+/*
+ * Queues a single-line reply (str followed by "\r\n") in the connection's
+ * write buffer and switches the connection to conn_write, returning to
+ * conn_read afterwards.
+ *
+ * When the pending command asked for "noreply", nothing is queued: the
+ * flag is cleared and the connection goes straight back to conn_read.
+ * A reply that does not fit in the write buffer is replaced by a fixed
+ * SERVER_ERROR line.
+ */
+static void out_string(conn *c, const char *str) {
+    const char *reply;
+    size_t reply_len;
+
+    assert(c != NULL);
+
+    if (c->noreply) {
+        if (settings.verbose > 1) {
+            fprintf(stderr, ">%d %s\n", c->sfd, str);
+        }
+        c->noreply = false;
+        conn_set_state(c, conn_read);
+        return;
+    }
+
+    reply = str;
+    reply_len = strlen(reply);
+    if ((reply_len + 2) > c->wsize) {
+        /* ought to be always enough. just fail for simplicity */
+        reply = "SERVER_ERROR output line too long";
+        reply_len = strlen(reply);
+    }
+
+    memcpy(c->wbuf, reply, reply_len);
+    memcpy(c->wbuf + reply_len, "\r\n", 2);
+    c->wcurr = c->wbuf;
+    c->wbytes = reply_len + 2;
+
+    conn_set_state(c, conn_write);
+    c->write_and_go = conn_read;
+}
+
+/*
+ * Runs once the value of a set/add/replace command has been read in full.
+ * The command is in c->item_comm and the populated item in c->item.
+ *
+ * The data block must end in "\r\n"; otherwise a CLIENT_ERROR is sent.
+ * A well-formed item is handed to store_item() and its status is turned
+ * into the protocol reply.  The item is released in every case.
+ */
+static void complete_nread(conn *c) {
+    item *it;
+    const char *reply;
+
+    assert(c != NULL);
+
+    it = c->item;
+
+    STATS_LOCK();
+    stats.set_cmds++;
+    STATS_UNLOCK();
+
+    if (strncmp(ITEM_data(it) + it->nbytes - 2, "\r\n", 2) != 0) {
+        reply = "CLIENT_ERROR bad data chunk";
+    } else {
+        switch (store_item(it, c->item_comm)) {
+        case 0:
+            reply = "STORED";
+            break;
+        case -2:
+            reply = "EXISTS";
+            break;
+        case -3:
+            reply = "NOT_FOUND";
+            break;
+        default:
+            reply = "NOT_STORED";
+            break;
+        }
+    }
+    out_string(c, reply);
+
+    item_free(c->item);
+    c->item = 0;
+}
+
+/*
+ * Stores an item in the database according to the semantics of one of the
+ * set commands (NREAD_SET, NREAD_ADD or NREAD_REPLACE).  The trailing
+ * "\r\n" of the data block is not stored (hence nbytes - 2).
+ *
+ * Returns the status of cdb_set2() unchanged.  complete_nread() reports
+ * 0 as STORED, -2 as EXISTS, -3 as NOT_FOUND and anything else as
+ * NOT_STORED.  An unrecognised command returns -1, so it is reported as
+ * NOT_STORED instead of being mistaken for a successful store.
+ */
+int store_item(item *it, int comm) {
+    char *key = ITEM_key(it);
+    int mode;
+
+    switch (comm) {
+    case NREAD_SET:
+        mode = CDB_OVERWRITE;
+        break;
+    case NREAD_ADD:
+        mode = CDB_INSERTIFNOEXIST;
+        break;
+    case NREAD_REPLACE:
+        mode = CDB_INSERTIFEXIST;
+        break;
+    default:
+        /* Nothing was written: do not claim success. */
+        return -1;
+    }
+
+    return cdb_set2(db, key, it->nkey, ITEM_data(it), it->nbytes - 2,
+                    CDB_INSERTCACHE | mode, it->expire);
+}
+
+/*
+ * adds a delta value to a numeric item.
+ */
+/*
+
+int add_delta(char *key, size_t nkey, int64_t delta, char *buf);
+int add_delta(char* key, size_t nkey, int64_t delta, char *buf) {
+    uint64_t value = hs_incr(db, key, delta);
+    snprintf(buf, INCR_MAX_STORAGE_LEN, "%llu", (unsigned long long)value);
+    return 0;
+}
+*/
+
+/* One space-delimited piece of a command line, as produced by tokenize_command(). */
+typedef struct token_s {
+    char *value;    /* start of the token text inside the command buffer */
+    size_t length;  /* token length in bytes; 0 marks the terminal token */
+} token_t;
+
+/* Indices into the token array.  SUBCOMMAND_TOKEN and KEY_TOKEN share slot 1. */
+#define COMMAND_TOKEN 0
+#define SUBCOMMAND_TOKEN 1
+#define KEY_TOKEN 1
+/* Command handlers reject keys longer than this with a CLIENT_ERROR reply. */
+#define KEY_MAX_LENGTH 250
+
+/* Size of the token array, including the slot used by the terminal token. */
+#define MAX_TOKENS 8
+
+/*
+ * Splits a command string into space-separated tokens, in place.  The
+ * space that ends a token is overwritten with '\0'; runs of spaces yield
+ * no empty tokens.  At most max_tokens - 1 real tokens are produced,
+ * followed by one terminal token whose length is zero and whose value is
+ * NULL when the whole string was consumed, or otherwise points at the
+ * first unprocessed character.
+ *
+ * Returns the number of entries written to tokens, terminal token
+ * included.
+ *
+ * Usage example:
+ *
+ *  ntokens = tokenize_command(command, tokens, MAX_TOKENS);
+ *  for (ix = 0; tokens[ix].length != 0; ix++) {
+ *      ...
+ *  }
+ *  if (tokens[ix].value != NULL) {
+ *      // more input left: tokenize again starting at tokens[ix].value
+ *  }
+ */
+static size_t tokenize_command(char *command, token_t *tokens, const size_t max_tokens) {
+    char *start;
+    char *cur;
+    size_t count = 0;
+
+    assert(command != NULL && tokens != NULL && max_tokens > 1);
+
+    start = command;
+    cur = command;
+
+    while (count < max_tokens - 1) {
+        if (*cur == '\0') {
+            /* End of string: emit the trailing token, if there is one. */
+            if (cur != start) {
+                tokens[count].value = start;
+                tokens[count].length = cur - start;
+                count++;
+            }
+            break;
+        }
+
+        if (*cur == ' ') {
+            if (cur != start) {
+                tokens[count].value = start;
+                tokens[count].length = cur - start;
+                count++;
+                *cur = '\0';
+            }
+            start = cur + 1;
+        }
+
+        cur++;
+    }
+
+    /* Terminal token: NULL after a full scan, else the unprocessed rest. */
+    tokens[count].value = (*cur == '\0') ? NULL : cur;
+    tokens[count].length = 0;
+    count++;
+
+    return count;
+}
+
+/*
+ * Turns on c->noreply when the last real token of the command (the one
+ * just before the terminal token) is the word "noreply".  Returns the
+ * resulting value of c->noreply.
+ *
+ * NOTE: this is not the first place a reply may be sent from;
+ * process_command() may already answer a request line with the wrong
+ * number of tokens.  Looking for "noreply" in a malformed line is not
+ * reliable anyway, so it can't be helped.
+ */
+static inline bool set_noreply_maybe(conn *c, token_t *tokens, size_t ntokens)
+{
+    const int noreply_index = ntokens - 2;
+    const char *candidate = tokens[noreply_index].value;
+
+    if (candidate != NULL && strcmp(candidate, "noreply") == 0) {
+        c->noreply = true;
+    }
+
+    return c->noreply;
+}
+
+/*
+ * Handles the "stats" command family.
+ *
+ *   "stats"        replies with a block of "STAT name value" lines that
+ *                  ends in "END".
+ *   "stats reset"  calls stats_reset() and replies "RESET".
+ *   anything else  replies "ERROR".
+ *
+ * The report is assembled with bounded writes: the fixed text alone takes
+ * roughly three quarters of a kilobyte, so large counter values could
+ * run past a 1024-byte buffer when written with plain sprintf().
+ */
+static void process_stat(conn *c, token_t *tokens, const size_t ntokens) {
+    time_t now = time(0);
+    char *command;
+    char *subcommand;
+
+    assert(c != NULL);
+
+    if(ntokens < 2) {
+        out_string(c, "CLIENT_ERROR bad command line");
+        return;
+    }
+
+    command = tokens[COMMAND_TOKEN].value;
+
+    if (ntokens == 2 && strcmp(command, "stats") == 0) {
+        /* Big enough for every line even with 20-digit values. */
+        char temp[2048];
+        pid_t pid = getpid();
+        uint64_t total = 0, curr = 0;
+        CDBSTAT db_stat;
+        cdb_stat(db, &db_stat);
+        total = db_stat.rnum;
+        char *pos = temp;
+
+        temp[0] = '\0';
+
+/*
+ * Appends formatted text at pos without ever writing past the end of
+ * temp.  On truncation pos stops at the final byte, so later appends
+ * become no-ops and the buffer stays NUL-terminated.
+ */
+#define STAT_APPEND(...)                                                   \
+        do {                                                               \
+            size_t room_ = sizeof(temp) - (size_t)(pos - temp);            \
+            int wrote_ = snprintf(pos, room_, __VA_ARGS__);                \
+            if (wrote_ > 0)                                                \
+                pos += ((size_t)wrote_ < room_) ? (size_t)wrote_           \
+                                                : room_ - 1;               \
+        } while (0)
+
+#ifndef WIN32
+        struct rusage usage;
+        getrusage(RUSAGE_SELF, &usage);
+#endif /* !WIN32 */
+
+        STATS_LOCK();
+        STAT_APPEND("STAT pid %ld\r\n", (long)pid);
+        STAT_APPEND("STAT uptime %"PRIuS"\r\n", now - stats.started);
+        STAT_APPEND("STAT time %"PRIuS"\r\n", now);
+        STAT_APPEND("STAT version " VERSION "\r\n");
+        STAT_APPEND("STAT pointer_size %"PRIuS"\r\n", 8 * sizeof(void *));
+#ifndef WIN32
+        STAT_APPEND("STAT rusage_user %ld.%06ld\r\n", usage.ru_utime.tv_sec, usage.ru_utime.tv_usec);
+        STAT_APPEND("STAT rusage_system %ld.%06ld\r\n", usage.ru_stime.tv_sec, usage.ru_stime.tv_usec);
+        STAT_APPEND("STAT rusage_minflt %"PRIu64"\r\n", usage.ru_minflt);
+        STAT_APPEND("STAT rusage_majflt %"PRIu64"\r\n", usage.ru_majflt);
+        STAT_APPEND("STAT rusage_nswap %"PRIu64"\r\n", usage.ru_nswap);
+        STAT_APPEND("STAT rusage_inblock %"PRIu64"\r\n", usage.ru_inblock);
+        STAT_APPEND("STAT rusage_oublock %"PRIu64"\r\n", usage.ru_oublock);
+        STAT_APPEND("STAT rusage_nvcsw %"PRIu64"\r\n", usage.ru_nvcsw);
+        STAT_APPEND("STAT rusage_nivcsw %"PRIu64"\r\n", usage.ru_nivcsw);
+#endif /* !WIN32 */
+#ifdef HAVE_READPROC
+        proc_t p;
+        get_proc_stats(getpid(), &p);
+        STAT_APPEND("STAT rusage_maxrss %"PRIu64"\r\n", p.vm_rss);
+#endif
+        STAT_APPEND("STAT item_buf_size %"PRIuS"\r\n", settings.item_buf_size);
+        STAT_APPEND("STAT curr_connections %"PRIu32"\r\n", stats.curr_conns - 1); /* ignore listening conn */
+        STAT_APPEND("STAT total_connections %"PRIu32"\r\n", stats.total_conns);
+        STAT_APPEND("STAT connection_structures %"PRIu32"\r\n", stats.conn_structs);
+        STAT_APPEND("STAT cmd_get %"PRIu64"\r\n", stats.get_cmds);
+        STAT_APPEND("STAT cmd_set %"PRIu64"\r\n", stats.set_cmds);
+        STAT_APPEND("STAT cmd_delete %"PRIu64"\r\n", stats.delete_cmds);
+        STAT_APPEND("STAT slow_cmd %"PRIu64"\r\n", stats.slow_cmds);
+        STAT_APPEND("STAT get_hits %"PRIu64"\r\n", stats.get_hits);
+        STAT_APPEND("STAT get_misses %"PRIu64"\r\n", stats.get_misses);
+        STAT_APPEND("STAT curr_items %"PRIu64"\r\n", curr);
+        STAT_APPEND("STAT total_items %"PRIu64"\r\n", total);
+        STAT_APPEND("STAT bytes_read %"PRIu64"\r\n", stats.bytes_read);
+        STAT_APPEND("STAT bytes_written %"PRIu64"\r\n", stats.bytes_written);
+        STAT_APPEND("STAT threads %d\r\n", settings.num_threads);
+        STAT_APPEND("STAT records_in_cache %lu\r\n", db_stat.rcnum);
+        STAT_APPEND("STAT pages_total %lu\r\n", db_stat.pnum);
+        STAT_APPEND("STAT pages_in_cache %lu\r\n", db_stat.pcnum);
+        STAT_APPEND("STAT record_cache_hits %lu\r\n", db_stat.rchit);
+        STAT_APPEND("STAT record_cache_misses %lu\r\n", db_stat.rcmiss);
+        STAT_APPEND("STAT page_cache_hits %lu\r\n", db_stat.pchit);
+        STAT_APPEND("STAT page_cache_misses %lu\r\n", db_stat.pcmiss);
+        STAT_APPEND("STAT read_latency_avg  %u\r\n", db_stat.rlatcy);
+        STAT_APPEND("STAT write_latency_avg %u\r\n", db_stat.wlatcy);
+        STAT_APPEND("END");
+        STATS_UNLOCK();
+
+#undef STAT_APPEND
+
+        /* out_string() substitutes an error line if this exceeds c->wsize. */
+        out_string(c, temp);
+        return;
+    }
+
+    subcommand = tokens[SUBCOMMAND_TOKEN].value;
+
+    if (strcmp(subcommand, "reset") == 0) {
+        stats_reset();
+        out_string(c, "RESET");
+        return;
+    }
+
+    out_string(c, "ERROR");
+}
+
+/*
+ * Handles "get": looks up every key on the command line and queues a
+ * "VALUE ..." response for each hit, followed by "END".  When the line
+ * holds more keys than fit in the token array, the remainder is tokenized
+ * again into the same array (which is why ntokens is overwritten here).
+ *
+ * Items that were found are kept in c->ilist (c->icurr / c->ileft) because
+ * the queued iovecs point into them.
+ *
+ * An out-of-memory condition stops the lookup at once and is answered
+ * with SERVER_ERROR.  A key longer than KEY_MAX_LENGTH is answered with
+ * CLIENT_ERROR.  The get counters are updated in every case.
+ */
+static inline void process_get_command(conn *c, token_t *tokens, size_t ntokens) {
+    char *key;
+    size_t nkey;
+    int i = 0;
+    item *it = NULL;
+    token_t *key_token = &tokens[KEY_TOKEN];
+    int stats_get_cmds   = 0;
+    int stats_get_hits   = 0;
+    int stats_get_misses = 0;
+    bool out_of_memory = false;
+    assert(c != NULL);
+
+    do {
+        while(key_token->length != 0) {
+
+            key = key_token->value;
+            nkey = key_token->length;
+
+            if(nkey > KEY_MAX_LENGTH) {
+                /*
+                 * NOTE(review): items already placed in c->ilist[0..i) are
+                 * not released on this path and c->ileft is left untouched;
+                 * looks like a leak -- confirm who owns them before
+                 * changing it, since queued iovecs may still point at them.
+                 */
+                STATS_LOCK();
+                stats.get_cmds   += stats_get_cmds;
+                stats.get_hits   += stats_get_hits;
+                stats.get_misses += stats_get_misses;
+                STATS_UNLOCK();
+                out_string(c, "CLIENT_ERROR bad command line format");
+                return;
+            }
+
+            stats_get_cmds++;
+
+            it = item_get(key, nkey);
+
+            if (it) {
+                if (i >= c->isize) {
+                    item **new_list = realloc(c->ilist, sizeof(item *) * c->isize * 2);
+                    if (new_list) {
+                        c->isize *= 2;
+                        c->ilist = new_list;
+                    } else {
+                        item_free(it);
+                        it = NULL;
+                        out_of_memory = true;
+                        break;
+                    }
+                }
+
+                /*
+                 * Construct the response. Each hit adds three elements to the
+                 * outgoing data list:
+                 *   "VALUE "
+                 *   key
+                 *   " " + flags + " " + data length + "\r\n" + data (with \r\n)
+                 */
+
+                if (add_iov(c, "VALUE ", 6) != 0 ||
+                   add_iov(c, ITEM_key(it), it->nkey) != 0 ||
+                   add_iov(c, ITEM_suffix(it), it->nsuffix + it->nbytes) != 0)
+                   {
+                       item_free(it);
+                       it = NULL;
+                       out_of_memory = true;
+                       break;
+                   }
+
+                if (settings.verbose > 1)
+                    fprintf(stderr, ">%d sending key %s\n", c->sfd, ITEM_key(it));
+
+                stats_get_hits++;
+                *(c->ilist + i) = it;
+                i++;
+
+            } else {
+                stats_get_misses++;
+            }
+
+            key_token++;
+        }
+
+        /*
+         * Stop here on out-of-memory.  Falling through would tokenize
+         * again starting at the key that just failed, retrying it
+         * (possibly forever) and discarding the keys that follow it.
+         */
+        if (out_of_memory)
+            break;
+
+        /*
+         * If the command string hasn't been fully processed, get the next set
+         * of tokens.
+         */
+        if(key_token->value != NULL) {
+            ntokens = tokenize_command(key_token->value, tokens, MAX_TOKENS);
+            key_token = tokens;
+        }
+
+    } while(key_token->value != NULL);
+
+    c->icurr = c->ilist;
+    c->ileft = i;
+
+    if (settings.verbose > 1)
+        fprintf(stderr, ">%d END\n", c->sfd);
+
+    /*
+        If the loop was terminated because of out-of-memory, it is not
+        reliable to add END\r\n to the buffer, because it might not end
+        in \r\n. So we send SERVER_ERROR instead.
+    */
+    if (out_of_memory || add_iov(c, "END\r\n", 5) != 0) {
+        out_string(c, "SERVER_ERROR out of memory writing get response");
+    }
+    else {
+        conn_set_state(c, conn_mwrite);
+        c->msgcurr = 0;
+    }
+
+    STATS_LOCK();
+    stats.get_cmds   += stats_get_cmds;
+    stats.get_hits   += stats_get_hits;
+    stats.get_misses += stats_get_misses;
+    STATS_UNLOCK();
+
+    return;
+}
+
+/*
+ * Handles the header line of set/add/replace:
+ *
+ *   <command> <key> <flags> <exptime> <bytes> [noreply]
+ *
+ * Parses the numeric fields, allocates an item with room for the value
+ * plus its trailing "\r\n", and switches the connection to conn_nread so
+ * the value is read straight into the item.  comm is kept in
+ * c->item_comm for complete_nread().
+ *
+ * A malformed line is answered with CLIENT_ERROR.  If the item cannot be
+ * allocated, SERVER_ERROR is sent and the data line that follows is
+ * swallowed.
+ */
+static void process_update_command(conn *c, token_t *tokens, const size_t ntokens, int comm) {
+    char *key;
+    size_t nkey;
+    int flags;
+    time_t exptime;
+    int vlen;
+    item *it = NULL;
+
+    assert(c != NULL);
+
+    set_noreply_maybe(c, tokens, ntokens);
+
+    if (tokens[KEY_TOKEN].length > KEY_MAX_LENGTH) {
+        out_string(c, "CLIENT_ERROR bad command line format");
+        return;
+    }
+
+    key = tokens[KEY_TOKEN].value;
+    nkey = tokens[KEY_TOKEN].length;
+
+    /*
+     * The strto*() functions only ever set errno, they never clear it, so
+     * a leftover value from an earlier call must not be mistaken for a
+     * parse error.
+     */
+    errno = 0;
+    flags = strtoul(tokens[2].value, NULL, 10);
+    exptime = strtol(tokens[3].value, NULL, 10);
+    vlen = strtol(tokens[4].value, NULL, 10);
+
+    if(errno == ERANGE || ((flags == 0 || exptime == 0) && errno == EINVAL)
+       || vlen < 0) {
+        out_string(c, "CLIENT_ERROR bad command line format");
+        return;
+    }
+
+    it = item_alloc1(key, nkey, flags, vlen+2);
+
+    if (it == NULL) {
+        out_string(c, "SERVER_ERROR out of memory storing object");
+        /* swallow the data line */
+        c->write_and_go = conn_swallow;
+        c->sbytes = vlen + 2;
+        return;
+    }
+
+    /* Only touch the item once the allocation is known to have worked. */
+    it->expire = exptime;
+    it->flag = flags;
+
+    c->item = it;
+    c->ritem = ITEM_data(it);
+    c->rlbytes = it->nbytes;
+    c->item_comm = comm;
+    conn_set_state(c, conn_nread);
+}
+
+/*
+ * Parses an unsigned decimal integer from str into *out.
+ *
+ * Leading whitespace is allowed.  The digits must be followed by the end
+ * of the string or by whitespace.  *out is set to 0 before parsing and is
+ * only overwritten on success.
+ *
+ * Returns true on success.  Returns false when str holds no digits, when
+ * the value does not fit in an unsigned long long, when the number is
+ * followed by any other character, or when it carries a minus sign
+ * (strtoull() would otherwise wrap it around silently).
+ */
+bool safe_strtoull(const char *str, uint64_t *out) {
+    char *endptr;
+    const char *p;
+    unsigned long long ull;
+
+    assert(out != NULL);
+    errno = 0;
+    *out = 0;
+
+    /* isspace() is only defined for unsigned char values and EOF. */
+    for (p = str; isspace((unsigned char)*p); p++) {
+    }
+    if (*p == '-')
+        return false;
+
+    ull = strtoull(str, &endptr, 10);
+    if (errno == ERANGE)
+        return false;
+    if (endptr == str)
+        return false; /* no digits were consumed */
+    if (*endptr == '\0' || isspace((unsigned char)*endptr)) {
+        *out = ull;
+        return true;
+    }
+    return false;
+}
+
+/*
+
+
+static void process_arithmetic_command(conn *c, token_t *tokens, const size_t ntokens, const bool incr) {
+    char temp[INCR_MAX_STORAGE_LEN];
+    uint64_t delta;
+    char *key;
+    size_t nkey;
+
+    assert(c != NULL);
+
+    set_noreply_maybe(c, tokens, ntokens);
+ 
+    STATS_LOCK();
+    stats.set_cmds++;
+    STATS_UNLOCK();
+
+    if (tokens[KEY_TOKEN].length > KEY_MAX_LENGTH) {
+        out_string(c, "CLIENT_ERROR bad command line format");
+        return;
+    }
+
+    key = tokens[KEY_TOKEN].value;
+    nkey = tokens[KEY_TOKEN].length;
+
+    if (!safe_strtoull(tokens[2].value, &delta)) {
+        out_string(c, "CLIENT_ERROR invalid numeric delta argument");
+        return;
+    }
+    
+    switch(add_delta(key, nkey, delta, temp)) {
+    case 0:
+        out_string(c, temp);
+        break;
+//    case NON_NUMERIC:
+//        out_string(c, "CLIENT_ERROR cannot increment or decrement non-numeric value");
+//        break;
+//    case EOM:
+//        out_string(c, "SERVER_ERROR out of memory");
+//        break;
+    }
+}
+*/
+
+
+/*
+ * Handles "delete <key> [noreply]": removes the key from the database and
+ * replies DELETED, NOT_FOUND or, for any other status, SERVER_ERROR.
+ * A key longer than KEY_MAX_LENGTH is answered with CLIENT_ERROR.
+ */
+static void process_delete_command(conn *c, token_t *tokens, const size_t ntokens) {
+    char *key;
+    size_t nkey;
+    assert(c != NULL);
+
+    set_noreply_maybe(c, tokens, ntokens);
+
+    STATS_LOCK();
+    stats.delete_cmds++;
+    STATS_UNLOCK();
+
+    key = tokens[KEY_TOKEN].value;
+    nkey = tokens[KEY_TOKEN].length;
+    if(nkey > KEY_MAX_LENGTH) {
+        out_string(c, "CLIENT_ERROR bad command line format");
+        return;
+    }
+
+    switch (cdb_del(db, key, nkey)) {
+    case 0:
+        out_string(c, "DELETED");
+        break;
+    case -3:
+        out_string(c, "NOT_FOUND");
+        break;
+    default:
+        /*
+         * Every path has to go through out_string(): it is what answers
+         * the client and what clears c->noreply.  Skipping it would leave
+         * the client waiting and let a "noreply" leak into the next
+         * command on this connection.
+         */
+        out_string(c, "SERVER_ERROR failed to delete item");
+        break;
+    }
+    return;
+}
+
+/*
+ * Handles "verbosity <level>": sets settings.verbose, capped at
+ * MAX_VERBOSITY_LEVEL, and replies "OK".  A level that is out of range
+ * for strtoul() is answered with CLIENT_ERROR.
+ */
+static void process_verbosity_command(conn *c, token_t *tokens, const size_t ntokens) {
+    unsigned int level;
+
+    assert(c != NULL);
+
+    set_noreply_maybe(c, tokens, ntokens);
+
+    /* strtoul() never clears errno; reset it so the check below is ours. */
+    errno = 0;
+    level = strtoul(tokens[1].value, NULL, 10);
+    if(errno == ERANGE) {
+        out_string(c, "CLIENT_ERROR bad command line format");
+        return;
+    }
+    settings.verbose = level > MAX_VERBOSITY_LEVEL ? MAX_VERBOSITY_LEVEL : level;
+    out_string(c, "OK");
+    return;
+}
+
+/*
+ * Parses one command line and dispatches it to its handler.
+ *
+ * The outgoing message list is reset first.  The line is then tokenized
+ * in place and matched on its first token and token count; unknown or
+ * malformed commands get "ERROR".  Commands slower than
+ * settings.slow_cmd_time are counted, and when an access log is open a
+ * line is written for every command that carried at least one argument.
+ */
+static void process_command(conn *c, char *command) {
+
+    token_t tokens[MAX_TOKENS];
+    size_t ntokens;
+    int comm;
+    struct timespec start, end;
+
+    assert(c != NULL);
+
+    if (settings.verbose > 1)
+        fprintf(stderr, "<%d %s\n", c->sfd, command);
+
+    /*
+     * for commands set/add/replace, we build an item and read the data
+     * directly into it, then continue in nread_complete().
+     */
+
+    c->msgcurr = 0;
+    c->msgused = 0;
+    c->iovused = 0;
+    if (add_msghdr(c) != 0) {
+        out_string(c, "SERVER_ERROR out of memory preparing response");
+        return;
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &start);
+
+    ntokens = tokenize_command(command, tokens, MAX_TOKENS);
+    if (ntokens >= 3 &&
+        (strcmp(tokens[COMMAND_TOKEN].value, "get") == 0) ) {
+
+        process_get_command(c, tokens, ntokens);
+
+    } else if ((ntokens == 6 || ntokens == 7) &&
+                ((strcmp(tokens[COMMAND_TOKEN].value, "set") == 0 && (comm = NREAD_SET)) ||
+                 (strcmp(tokens[COMMAND_TOKEN].value, "add") == 0 && (comm = NREAD_ADD)) ||
+                 (strcmp(tokens[COMMAND_TOKEN].value, "replace") == 0 && (comm = NREAD_REPLACE)))) {
+
+        process_update_command(c, tokens, ntokens, comm);
+
+//    } else if ((ntokens == 4 || ntokens == 5) && (strcmp(tokens[COMMAND_TOKEN].value, "incr") == 0)) {
+
+//            process_arithmetic_command(c, tokens, ntokens, 1);
+
+    } else if (ntokens >= 3 && ntokens <= 4 && (strcmp(tokens[COMMAND_TOKEN].value, "delete") == 0)) {
+
+        process_delete_command(c, tokens, ntokens);
+
+    } else if (ntokens >= 2 && (strcmp(tokens[COMMAND_TOKEN].value, "stats") == 0)) {
+
+        process_stat(c, tokens, ntokens);
+
+    } else if (ntokens == 2 && (strcmp(tokens[COMMAND_TOKEN].value, "version") == 0)) {
+
+        out_string(c, "VERSION " VERSION);
+
+    } else if (ntokens == 2 && (strcmp(tokens[COMMAND_TOKEN].value, "quit") == 0)) {
+
+        conn_set_state(c, conn_closing);
+
+    } else if (ntokens == 3 && (strcmp(tokens[COMMAND_TOKEN].value, "verbosity") == 0)) {
+
+        process_verbosity_command(c, tokens, ntokens);
+
+/*    } else if (ntokens >= 2 && ntokens <= 4 && (strcmp(tokens[COMMAND_TOKEN].value, "flush_all") == 0)) {
+
+        set_noreply_maybe(c, tokens, ntokens);
+
+        int limit = 10000;
+        if (ntokens == (c->noreply ? 4 : 3)) {
+            limit = strtol(tokens[1].value, NULL, 10);
+            if(errno == ERANGE) {
+                out_string(c, "CLIENT_ERROR bad command line format");
+                return;
+            }
+        }
+
+        hs_optimize(db, limit);
+        out_string(c, "OK");
+        return;
+*/
+    } else {
+        out_string(c, "ERROR");
+        return;
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &end);
+    float secs = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
+    if (secs > settings.slow_cmd_time) {
+        STATS_LOCK();
+        stats.slow_cmds ++;
+        STATS_UNLOCK();
+    }
+
+    // access logging
+    if (NULL != access_log && ntokens >= 3) {
+        char now[255];
+        char host[NI_MAXHOST], serv[NI_MAXSERV];
+        struct sockaddr_storage addr;
+        socklen_t addrlen = sizeof(addr);
+        time_t t = time(NULL);
+        struct tm *tm_now = localtime(&t);
+        /*
+         * process_get_command() may have tokenized again into this array,
+         * in which case slot 1 can be the terminal token with a NULL value.
+         */
+        const char *first_arg = tokens[1].value != NULL ? tokens[1].value : "-";
+
+        if (tm_now == NULL ||
+            strftime(now, 200, "%Y-%m-%d %H:%M:%S", tm_now) == 0) {
+            strcpy(now, "-");
+        }
+
+        /* Never print the address buffers unless both lookups succeeded. */
+        if (getpeername(c->sfd, (struct sockaddr*)&addr, &addrlen) != 0 ||
+            getnameinfo((struct sockaddr*)&addr, addrlen, host, sizeof(host),
+                        serv, sizeof(serv), NI_NUMERICSERV) != 0) {
+            strcpy(host, "-");
+            strcpy(serv, "-");
+        }
+
+        fprintf(access_log, "%s %s:%s %s %s %.3f\n", now, host, serv,
+                command, first_arg, secs*1000);
+    }
+
+    return;
+}
+
+/*
+ * Processes one command if the read buffer holds a complete line.
+ *
+ * The line is terminated in place (a "\r" directly before the "\n" is
+ * cut off as well when at least one other character precedes it), handed
+ * to process_command(), and then consumed from the buffer.
+ *
+ * Returns 1 when a command was processed, 0 when no complete line is
+ * available yet.
+ */
+static int try_read_command(conn *c) {
+    char *newline;
+    char *line_end;
+    char *next;
+
+    assert(c != NULL);
+    assert(c->rcurr <= (c->rbuf + c->rsize));
+
+    if (c->rbytes == 0)
+        return 0;
+
+    newline = memchr(c->rcurr, '\n', c->rbytes);
+    if (newline == NULL)
+        return 0;
+
+    next = newline + 1;
+    line_end = newline;
+    if ((newline - c->rcurr) > 1 && newline[-1] == '\r') {
+        line_end = newline - 1;
+    }
+    *line_end = '\0';
+
+    assert(next <= (c->rcurr + c->rbytes));
+
+    process_command(c, c->rcurr);
+
+    /* Drop the processed line, terminator included, from the buffer. */
+    c->rbytes -= (next - c->rcurr);
+    c->rcurr = next;
+
+    assert(c->rcurr <= (c->rbuf + c->rsize));
+
+    return 1;
+}
+
+/*
+ * Reads as much as possible from the socket into the read buffer.
+ *
+ * Any unprocessed fragment is first moved to the start of the buffer.
+ * The buffer is doubled whenever it is full; if that fails, the input is
+ * discarded, a SERVER_ERROR is queued and the connection is set to close
+ * after the write.  End of stream and read errors other than
+ * EAGAIN/EWOULDBLOCK move the connection to conn_closing.
+ *
+ * Returns 0 if nothing was read and the state did not change, 1 otherwise.
+ */
+static int try_read_network(conn *c) {
+    int got_any = 0;
+
+    assert(c != NULL);
+
+    if (c->rcurr != c->rbuf) {
+        if (c->rbytes != 0) { /* otherwise there's nothing to copy */
+            memmove(c->rbuf, c->rcurr, c->rbytes);
+        }
+        c->rcurr = c->rbuf;
+    }
+
+    for (;;) {
+        int room;
+        int nread;
+
+        if (c->rbytes >= c->rsize) {
+            char *bigger = realloc(c->rbuf, c->rsize * 2);
+            if (bigger == NULL) {
+                if (settings.verbose > 0) {
+                    fprintf(stderr, "Couldn't realloc input buffer\n");
+                }
+                c->rbytes = 0; /* ignore what we read */
+                out_string(c, "SERVER_ERROR out of memory reading request");
+                c->write_and_go = conn_closing;
+                return 1;
+            }
+            c->rbuf = bigger;
+            c->rcurr = bigger;
+            c->rsize *= 2;
+        }
+
+        room = c->rsize - c->rbytes;
+        nread = read(c->sfd, c->rbuf + c->rbytes, room);
+
+        if (nread > 0) {
+            STATS_LOCK();
+            stats.bytes_read += nread;
+            STATS_UNLOCK();
+            got_any = 1;
+            c->rbytes += nread;
+            /* A completely filled buffer means more data may be waiting. */
+            if (nread != room) {
+                break;
+            }
+            continue;
+        }
+
+        if (nread == 0) {
+            /* connection closed */
+            conn_set_state(c, conn_closing);
+            return 1;
+        }
+
+        if (nread == -1) {
+            if (errno == EAGAIN || errno == EWOULDBLOCK) {
+                break;
+            }
+            /* Should close on unhandled errors. */
+            conn_set_state(c, conn_closing);
+            return 1;
+        }
+    }
+
+    return got_any;
+}
+
+/*
+ * Records the event flags the connection wants to wait for next in
+ * c->ev_flags.  Only the field is updated here.  Always returns true.
+ */
+static bool update_event(conn *c, const int new_flags) {
+    c->ev_flags = new_flags;
+    return true;
+}
+
+/*
+ * Sends the next chunk of data from the connection's msghdr list.
+ *
+ * Returns:
+ *   TRANSMIT_COMPLETE   Nothing left to write.
+ *   TRANSMIT_INCOMPLETE Some data was written; call again for the rest.
+ *   TRANSMIT_SOFT_ERROR The socket cannot take more data right now.
+ *   TRANSMIT_HARD_ERROR Writing failed (c->state is set to conn_closing).
+ */
+static int transmit(conn *c) {
+    struct msghdr *msg;
+    ssize_t sent;
+
+    assert(c != NULL);
+
+    /* Finished writing the current msg; advance to the next. */
+    if (c->msgcurr < c->msgused &&
+            c->msglist[c->msgcurr].msg_iovlen == 0) {
+        c->msgcurr++;
+    }
+
+    if (c->msgcurr >= c->msgused) {
+        return TRANSMIT_COMPLETE;
+    }
+
+    msg = &c->msglist[c->msgcurr];
+    sent = sendmsg(c->sfd, msg, 0);
+
+    if (sent > 0) {
+        STATS_LOCK();
+        stats.bytes_written += sent;
+        STATS_UNLOCK();
+
+        /* Drop every iovec entry that went out in full. */
+        while (msg->msg_iovlen > 0 && sent >= msg->msg_iov->iov_len) {
+            sent -= msg->msg_iov->iov_len;
+            msg->msg_iovlen--;
+            msg->msg_iov++;
+        }
+
+        /* A partly written entry is trimmed so the next call resumes it. */
+        if (sent > 0) {
+            msg->msg_iov->iov_base = (char *)msg->msg_iov->iov_base + sent;
+            msg->msg_iov->iov_len -= sent;
+        }
+        return TRANSMIT_INCOMPLETE;
+    }
+
+    if (sent == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
+        update_event(c, AE_WRITABLE);
+        return TRANSMIT_SOFT_ERROR;
+    }
+
+    /* sent == 0, or -1 with any other errno: a real error, so close. */
+    if (settings.verbose > 0) {
+        perror("Failed to write, and not due to blocking");
+    }
+
+    conn_set_state(c, conn_closing);
+    return TRANSMIT_HARD_ERROR;
+}
+
+void drive_machine(conn *c) {
+    bool stop = false;
+    int sfd, flags = 1;
+    socklen_t addrlen;
+    struct sockaddr_storage addr;
+    int res;
+    /* Run c's state machine until it would block, accepting fails, or the connection is closed. */
+    assert(c != NULL);
+
+    while (!stop) {
+        /* each pass handles the current state; a case sets stop when nothing more can be done now */
+        switch(c->state) {
+        case conn_listening:
+            addrlen = sizeof(addr);
+            if ((sfd = accept(c->sfd, (struct sockaddr *)&addr, &addrlen)) == -1) {
+                stop = true;
+                if (errno == EAGAIN || errno == EWOULDBLOCK) {
+                    /* these are transient, so don't log anything */
+                } else if (errno == EMFILE) {
+                    if (settings.verbose > 0)
+                        fprintf(stderr, "Too many open connections\n");
+                    if (stub_fd > 0){
+                        /* free the reserved fd so the pending client can be accepted and dropped */
+                        close(stub_fd);
+                        if ((sfd = accept(c->sfd, (struct sockaddr *)&addr, &addrlen)) != -1) {
+                            close(sfd);
+                            stop = false;
+                        } else if (settings.verbose > 0) {
+                            fprintf(stderr, "Too many open connections 2\n");
+                        }
+                        stub_fd = open("/dev/null", O_RDONLY); /* re-reserve on both paths; -1 disables the trick */
+                    }
+                } else {
+                    perror("accept()");
+                }
+                break; /* no usable sfd here: leave the switch, the loop retries only if stop was cleared */
+            }
+            if ((flags = fcntl(sfd, F_GETFL, 0)) < 0 ||
+                fcntl(sfd, F_SETFL, flags | O_NONBLOCK) < 0) {
+                perror("setting O_NONBLOCK");
+                close(sfd);
+                break;
+            }
+            if (NULL == conn_new(sfd, conn_read, DATA_BUFFER_SIZE)) {
+                if (settings.verbose > 0) {
+                    fprintf(stderr, "Can't listen for events on fd %d\n", sfd);
+                }
+                close(sfd);
+            }
+            break;
+
+        case conn_read:
+            if (try_read_command(c) != 0) {
+                continue;
+            }
+            if (try_read_network(c) != 0) {
+                continue;
+            }
+            /* we have no command line and no data to read from network */
+            update_event(c, AE_READABLE);
+            stop = true;
+            break;
+
+        case conn_nread:
+            /* we are reading rlbytes into ritem; */
+            if (c->rlbytes == 0) {
+                complete_nread(c);
+                break;
+            }
+            /* first check if we have leftovers in the conn_read buffer */
+            if (c->rbytes > 0) {
+                int tocopy = c->rbytes > c->rlbytes ? c->rlbytes : c->rbytes;
+                memcpy(c->ritem, c->rcurr, tocopy);
+                c->ritem += tocopy;
+                c->rlbytes -= tocopy;
+                c->rcurr += tocopy;
+                c->rbytes -= tocopy;
+                break;
+            }
+
+            /*  now try reading from the socket */
+            res = read(c->sfd, c->ritem, c->rlbytes);
+            if (res > 0) {
+                STATS_LOCK();
+                stats.bytes_read += res;
+                STATS_UNLOCK();
+                c->ritem += res;
+                c->rlbytes -= res;
+                break;
+            }
+            if (res == 0) { /* end of stream */
+                conn_set_state(c, conn_closing);
+                break;
+            }
+            if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
+                update_event(c, AE_READABLE);
+                stop = true;
+                break;
+            }
+            /* otherwise we have a real error, on which we close the connection */
+            if (settings.verbose > 0)
+                fprintf(stderr, "Failed to read, and not due to blocking\n");
+            conn_set_state(c, conn_closing);
+            break;
+
+        case conn_swallow:
+            /* we are reading sbytes and throwing them away */
+            if (c->sbytes == 0) {
+                conn_set_state(c, conn_read);
+                break;
+            }
+
+            /* first check if we have leftovers in the conn_read buffer */
+            if (c->rbytes > 0) {
+                int tocopy = c->rbytes > c->sbytes ? c->sbytes : c->rbytes;
+                c->sbytes -= tocopy;
+                c->rcurr += tocopy;
+                c->rbytes -= tocopy;
+                break;
+            }
+
+            /*  now try reading from the socket */
+            res = read(c->sfd, c->rbuf, c->rsize > c->sbytes ? c->sbytes : c->rsize);
+            if (res > 0) {
+                STATS_LOCK();
+                stats.bytes_read += res;
+                STATS_UNLOCK();
+                c->sbytes -= res;
+                break;
+            }
+            if (res == 0) { /* end of stream */
+                conn_set_state(c, conn_closing);
+                break;
+            }
+            if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
+                update_event(c, AE_READABLE);
+                stop = true;
+                break;
+            }
+            /* otherwise we have a real error, on which we close the connection */
+            if (settings.verbose > 0)
+                fprintf(stderr, "Failed to read, and not due to blocking\n");
+            conn_set_state(c, conn_closing);
+            break;
+
+        case conn_write:
+            /*
+             * We want to write out a simple response. If we haven't already,
+             * assemble it into a msgbuf list (this will be a single-entry
+             * list for TCP or a two-entry list for UDP).
+             */
+            if (c->iovused == 0) {
+                if (add_iov(c, c->wcurr, c->wbytes) != 0) {
+                    if (settings.verbose > 0)
+                        fprintf(stderr, "Couldn't build response\n");
+                    conn_set_state(c, conn_closing);
+                    break;
+                }
+            }
+
+            /* fall through... */
+
+        case conn_mwrite:
+            switch (transmit(c)) {
+            case TRANSMIT_COMPLETE:
+                if (c->state == conn_mwrite) {
+                    /* the whole reply went out: release every item that was queued for it */
+                    while (c->ileft > 0) {
+                        item *it = *(c->icurr);
+                        item_free(it);
+                        c->icurr++;
+                        c->ileft--;
+                    }
+                    conn_set_state(c, conn_read);
+                } else if (c->state == conn_write) {
+                    if (c->write_and_free) {
+                        free(c->write_and_free);
+                        c->write_and_free = 0;
+                    }
+                    conn_set_state(c, c->write_and_go);
+                } else {
+                    if (settings.verbose > 0)
+                        fprintf(stderr, "Unexpected state %d\n", c->state);
+                    conn_set_state(c, conn_closing);
+                }
+                break;
+
+            case TRANSMIT_INCOMPLETE:
+            case TRANSMIT_HARD_ERROR:
+                break;                   /* Continue in state machine. */
+
+            case TRANSMIT_SOFT_ERROR:
+                stop = true;
+                break;
+            }
+            break;
+
+        case conn_closing:
+            conn_close(c);
+            stop = true;
+            break;
+        }
+    }
+
+    return;
+}
+
+static int new_socket(struct addrinfo *ai) {
+    /* Create a non-blocking socket for ai; returns its fd, or -1 after reporting via perror(). */
+    int fd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
+    int fl;
+
+    if (fd == -1) {
+        perror("socket()");
+        return -1;
+    }
+    fl = fcntl(fd, F_GETFL, 0);
+    if (fl >= 0 && fcntl(fd, F_SETFL, fl | O_NONBLOCK) >= 0) {
+        return fd;   /* both fcntl() calls succeeded */
+    }
+    perror("setting O_NONBLOCK");
+    close(fd);
+    return -1;
+}
+
+static int server_socket(const int port, const bool is_udp) {
+    /* Bind and listen on each address settings.inter resolves to for port; 0 once any is bound, else 1. */
+    struct linger no_linger = {0, 0};
+    struct addrinfo *results;
+    struct addrinfo *cur;
+    struct addrinfo hints;
+    char service[NI_MAXSERV];
+    int rc;
+    int bound = 0;       /* how many addresses were bound successfully */
+    int fd;
+    int on = 1;          /* "enable" value shared by the boolean socket options */
+
+    /*
+     * Zero the hints first: some implementations carry nonstandard
+     * fields that otherwise mess things up.
+     */
+    memset(&hints, 0, sizeof (hints));
+    hints.ai_socktype = SOCK_STREAM;
+    hints.ai_protocol = IPPROTO_TCP;
+    hints.ai_family = AF_UNSPEC;
+    hints.ai_flags = AI_PASSIVE|AI_ADDRCONFIG;
+
+    snprintf(service, NI_MAXSERV, "%d", port);
+    rc = getaddrinfo(settings.inter, service, &hints, &results);
+    if (rc != 0) {
+        if (rc == EAI_SYSTEM)
+            perror("getaddrinfo()");
+        else
+            fprintf(stderr, "getaddrinfo(): %s\n", gai_strerror(rc));
+
+        return 1;
+    }
+
+    for (cur = results; cur != NULL; cur = cur->ai_next) {
+        fd = new_socket(cur);
+        if (fd == -1) {
+            freeaddrinfo(results);
+            return 1;
+        }
+
+        setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on, sizeof(on));
+        setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, (void *)&on, sizeof(on));
+        setsockopt(fd, SOL_SOCKET, SO_LINGER, (void *)&no_linger, sizeof(no_linger));
+        setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (void *)&on, sizeof(on));
+
+        if (bind(fd, cur->ai_addr, cur->ai_addrlen) == -1) {
+            const int in_use = (errno == EADDRINUSE);
+            if (!in_use)
+                perror("bind()");
+            close(fd);
+            if (in_use)
+                continue;    /* address already taken: move on to the next candidate */
+            freeaddrinfo(results);
+            return 1;
+        }
+        bound++;
+        if (listen(fd, 1024) == -1) {
+            perror("listen()");
+            close(fd);
+            freeaddrinfo(results);
+            return 1;
+        }
+
+        /* wrap the socket in a connection object that starts in the listening state */
+        if (conn_new(fd, conn_listening, 1) == NULL) {
+            fprintf(stderr, "failed to create listening connection\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    freeaddrinfo(results);
+
+    /* Zero means at least one address was bound; non-zero means none was */
+    return bound == 0;
+}
+
+static void usage(void) { /* print the version banner and a summary of every accepted option to stdout */
+    printf(PACKAGE " " VERSION "\n");
+    printf("-p <num>      TCP port number to listen on (default: 8964)\n"
+           "-l <ip_addr>  interface to listen on, default is INADDR_ANY\n"
+           "-d            run as a daemon\n"
+           "-P <num>      page cache limit(MB), default 256(MB)\n"
+           "-r <num>      record cache limit(MB), default 256(MB)\n"
+           "-R <num>      bytes for a disk read operation, must be between[1024,65535), recommend to be larger than most small records, default is 4096(Bytes)\n"
+           "-L <file>     log file\n"
+           "-a <file>     access log file, use '-' for stdout\n"
+           "-b <num>      item buffer size in bytes, must be at least 512\n"
+           "-u <username> assume identity of <username> (only when run as root)\n"
+           "-c <num>      max simultaneous connections, default is 1024\n"
+           "-t <num>      number of threads to use, default 16\n"
+           "-H <dir>      home of database, default is 'testdb', make sure the directory exists\n"
+           "-s <num>      slow command time limit, in ms, default is 100ms\n"
+           "-n <num>      main hash table size, recommend to be 1%% - 10%% of maximum record num, default is 1000000\n"
+           "-v            verbose (print errors/warnings while in event loop)\n"
+           "-vv           very verbose (also print client commands/responses)\n"
+           "-h            print this help and exit\n"
+           "-i            print license info\n"
+           );
+}
+
+static void usage_license(void) { /* print the version banner and the bundled copyright/license notices to stdout */
+    printf(PACKAGE " " VERSION "\n\n");
+    printf(
+    "Copyright (c) 2012, Siyuan Fu. <fusiyuan2010@gmail.com>\n"
+    "All rights reserved.\n"
+    "\n"
+    "\n"
+    "This product includes software developed by Douban Inc.\n"
+    "\n"
+    "[ Beansdb ]\n"
+    "\n"
+    "Copyright (c) 2009, Douban Inc. <http://www.douban.com/>\n"
+    "All rights reserved.\n"
+    "\n"
+    "Redistribution and use in source and binary forms, with or without\n"
+    "modification, are permitted provided that the following conditions are\n"
+    "met:\n"
+    "\n"
+    "    * Redistributions of source code must retain the above copyright\n"
+    "notice, this list of conditions and the following disclaimer.\n"
+    "\n"
+    "    * Redistributions in binary form must reproduce the above\n"
+    "copyright notice, this list of conditions and the following disclaimer\n"
+    "in the documentation and/or other materials provided with the\n"
+    "distribution.\n"
+    "\n"
+    "    * Neither the name of the Douban Inc. nor the names of its\n"
+    "contributors may be used to endorse or promote products derived from\n"
+    "this software without specific prior written permission.\n"
+    "\n"
+    "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n"
+    "\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n"
+    "LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n"
+    "A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\n"
+    "OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n"
+    "SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n"
+    "LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n"
+    "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n"
+    "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n"
+    "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n"
+    "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
+    "\n"
+    "\n"
+    "This product includes software developed by Douban Inc.\n"
+    "\n"
+    "[ memcachedb ]\n"
+    "\n"
+    "Copyright (c) 2008, Steve Chu. <stvchu@gmail.com>\n"
+    "All rights reserved.\n"
+    "\n"
+    "Redistribution and use in source and binary forms, with or without\n"
+    "modification, are permitted provided that the following conditions are\n"
+    "met:\n"
+    "\n"
+    "    * Redistributions of source code must retain the above copyright\n"
+    "notice, this list of conditions and the following disclaimer.\n"
+    "\n"
+    "    * Redistributions in binary form must reproduce the above\n"
+    "copyright notice, this list of conditions and the following disclaimer\n"
+    "in the documentation and/or other materials provided with the\n"
+    "distribution.\n"
+    "\n"
+    "    * Neither the name of the Danga Interactive nor the names of its\n"
+    "contributors may be used to endorse or promote products derived from\n"
+    "this software without specific prior written permission.\n"
+    "\n"
+    "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n"
+    "\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n"
+    "LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n"
+    "A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\n"
+    "OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n"
+    "SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n"
+    "LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n"
+    "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n"
+    "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n"
+    "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n"
+    "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
+    "\n"
+    "\n"
+    "This product includes software developed by Danga Interactive, Inc.\n"
+    "\n"
+    "[ memcached ]\n"
+    "\n"
+    "Copyright (c) 2003, Danga Interactive, Inc. <http://www.danga.com/>\n"
+    "All rights reserved.\n"
+    "\n"
+    "Redistribution and use in source and binary forms, with or without\n"
+    "modification, are permitted provided that the following conditions are\n"
+    "met:\n"
+    "\n"
+    "    * Redistributions of source code must retain the above copyright\n"
+    "notice, this list of conditions and the following disclaimer.\n"
+    "\n"
+    "    * Redistributions in binary form must reproduce the above\n"
+    "copyright notice, this list of conditions and the following disclaimer\n"
+    "in the documentation and/or other materials provided with the\n"
+    "distribution.\n"
+    "\n"
+    "    * Neither the name of the Danga Interactive nor the names of its\n"
+    "contributors may be used to endorse or promote products derived from\n"
+    "this software without specific prior written permission.\n"
+    "\n"
+    "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n"
+    "\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n"
+    "LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n"
+    "A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\n"
+    "OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n"
+    "SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n"
+    "LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n"
+    "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n"
+    "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n"
+    "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n"
+    "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
+    "\n"
+    "\n"
+    "This product includes software developed by Niels Provos.\n"
+    "\n"
+    "[ libevent ]\n"
+    "\n"
+    "Copyright 2000-2003 Niels Provos <provos@citi.umich.edu>\n"
+    "All rights reserved.\n"
+    "\n"
+    "Redistribution and use in source and binary forms, with or without\n"
+    "modification, are permitted provided that the following conditions\n"
+    "are met:\n"
+    "1. Redistributions of source code must retain the above copyright\n"
+    "   notice, this list of conditions and the following disclaimer.\n"
+    "2. Redistributions in binary form must reproduce the above copyright\n"
+    "   notice, this list of conditions and the following disclaimer in the\n"
+    "   documentation and/or other materials provided with the distribution.\n"
+    "3. All advertising materials mentioning features or use of this software\n"
+    "   must display the following acknowledgement:\n"
+    "      This product includes software developed by Niels Provos.\n"
+    "4. The name of the author may not be used to endorse or promote products\n"
+    "   derived from this software without specific prior written permission.\n"
+    "\n"
+    "THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR\n"
+    "IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES\n"
+    "OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\n"
+    "IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,\n"
+    "INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT\n"
+    "NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n"
+    "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n"
+    "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n"
+    "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF\n"
+    "THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
+    );
+    /* NOTE(review): the [ memcachedb ] notice is introduced as Douban software and names Danga Interactive in its third clause — confirm against upstream before editing */
+    return;
+}
+
+
+/* Termination handler: records the shutdown request in daemon_quit and logs it; only the first request counts. */
+static void sig_handler(const int sig)
+{
+    const bool wants_exit = (sig == SIGTERM || sig == SIGQUIT || sig == SIGINT);
+
+    /* ignore unrelated signals, and repeats once an exit has been requested */
+    if (!wants_exit || daemon_quit == 1) {
+        return;
+    }
+    daemon_quit = 1;
+    fprintf(stderr, "Signal(%d) received, try to exit daemon gracefully..\n", sig);
+}
+
+
+int main (int argc, char **argv) {
+    int c;
+    /* Flow: parse options, raise the fd limit, daemonize, drop root, open the db, listen, run the loop. */
+    char *dbhome = "testdb";
+    bool daemonize = false;
+    char *username = NULL;
+    FILE *log_file = NULL;
+    struct passwd *pw;
+    struct sigaction sa;
+    struct rlimit rlim;
+    int rcache = 256, pcache = 256;
+    /* recommend for 100,000,000 records*/
+    int db_hsize = 1000000;
+    int areadsize = 4096;
+
+    /* init settings */
+    settings_init();
+
+    /* set stderr non-buffering (for running under, say, daemontools) */
+    setbuf(stderr, NULL);
+
+    /* process arguments */
+    while ((c = getopt(argc, argv, "a:p:c:hivl:dr:u:P:L:t:b:H:s:n:R:")) != -1) {
+        switch (c) {
+        case 'a':
+            if (strcmp(optarg, "-") == 0) {
+                access_log = stdout;
+            }else{
+                access_log = fopen(optarg, "a");
+                if (NULL == access_log) {
+                    fprintf(stderr, "open access_log %s failed\n", optarg);
+                    exit(1);
+                }
+            }
+            break;
+        case 'p':
+            settings.port = atoi(optarg);
+            break;
+        case 'c':
+            settings.maxconns = atoi(optarg);
+            break;
+        case 'h':
+            usage();
+            exit(EXIT_SUCCESS);
+        case 'i':
+            usage_license();
+            exit(EXIT_SUCCESS);
+        case 'v':
+            settings.verbose++;
+            break;
+        case 'l':
+            settings.inter= strdup(optarg);
+            break;
+        case 'd':
+            daemonize = true;
+            break;
+        case 'r':
+            rcache = atoi(optarg);
+            break;
+        case 'R':
+            areadsize = atoi(optarg);
+            break;
+        case 'u':
+            username = optarg;
+            break;
+        case 'P':
+            pcache = atoi(optarg);
+            break;
+        case 'L':
+            if ((log_file = fopen(optarg, "a")) != NULL){
+                setlinebuf(log_file);
+                fclose(stdout);
+                fclose(stderr);
+                stdout = stderr = log_file; /* NOTE(review): assigning to stdout/stderr is not portable C; freopen() is the standard route */
+            }else{
+                fprintf(stderr, "open log file %s failed\n", optarg);
+            }
+            break;
+        case 't':
+            settings.num_threads = atoi(optarg); /* atoi() gives 0 for non-numeric input */
+            if (settings.num_threads <= 0) {     /* negative counts are just as unusable as 0 */
+                fprintf(stderr, "Number of threads must be greater than 0\n");
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case 'b':
+            settings.item_buf_size = atoi(optarg);
+            if(settings.item_buf_size < 512){
+                fprintf(stderr, "item buf size must be larger than 512 bytes\n");
+                exit(EXIT_FAILURE);
+            }
+            if(settings.item_buf_size > 256 * 1024){
+                fprintf(stderr, "Warning: item buffer size(-b) larger than 256KB may cause performance issue\n");
+            }
+            break;
+        case 'H':
+            dbhome = optarg;
+            break;
+        case 's':
+            settings.slow_cmd_time = atoi(optarg) / 1000.0;
+            break;
+        case 'n':
+            db_hsize = atoi(optarg);
+            break;
+        default: /* every listed option has a case, so c is '?' here and optopt names the offender */
+            fprintf(stderr, "Illegal argument \"%c\"\n", optopt);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    /*
+     * If needed, increase rlimits to allow as many connections
+     * as needed.
+     */
+
+    if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) {
+        fprintf(stderr, "failed to getrlimit number of files\n");
+        exit(EXIT_FAILURE);
+    } else {
+        int maxfiles = settings.maxconns;
+        if (rlim.rlim_cur < maxfiles)
+            rlim.rlim_cur = maxfiles + 3;
+        if (rlim.rlim_max < rlim.rlim_cur)
+            rlim.rlim_max = rlim.rlim_cur;
+        if (setrlimit(RLIMIT_NOFILE, &rlim) != 0) {
+            fprintf(stderr, "failed to set rlimit for open files. Try running as root or requesting smaller maxconns value.\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    /* daemonize if requested */
+    /* if we want to ensure our ability to dump core, don't chdir to / */
+    if (daemonize) {
+        int res;
+        res = daemon(1, settings.verbose || log_file);
+        if (res == -1) {
+            fprintf(stderr, "failed to daemon() in order to daemonize\n");
+            return 1;
+        }
+    }
+
+    /* lose root privileges if we have them (NOTE(review): supplementary groups are not reset) */
+    if (getuid() == 0 || geteuid() == 0) {
+        if (username == 0 || *username == '\0') {
+            fprintf(stderr, "can't run as root without the -u switch\n");
+            return 1;
+        }
+        if ((pw = getpwnam(username)) == 0) {
+            fprintf(stderr, "can't find the user %s to switch to\n", username);
+            return 1;
+        }
+        if (setgid(pw->pw_gid) < 0 || setuid(pw->pw_uid) < 0) {
+            fprintf(stderr, "failed to assume identity of user %s\n", username);
+            return 1;
+        }
+    }
+
+    /* initialize other stuff */
+    item_init();
+    stats_init();
+    conn_init();
+
+    /*
+     * ignore SIGPIPE signals; we can use errno==EPIPE if we
+     * need that information
+     */
+    sa.sa_handler = SIG_IGN;
+    sa.sa_flags = 0;
+    if (sigemptyset(&sa.sa_mask) == -1 ||
+        sigaction(SIGPIPE, &sa, 0) == -1) {
+        perror("failed to ignore SIGPIPE; sigaction");
+        exit(EXIT_FAILURE);
+    }
+
+    /* open db */
+    db = cdb_new();
+    cdb_option(db, db_hsize, rcache, pcache);
+    cdb_option_areadsize(db, areadsize);
+
+    if (cdb_open(db, dbhome, CDB_CREAT | CDB_PAGEWARMUP) < 0) {
+        fprintf(stderr, "failed to open db %s\n", dbhome);
+        exit(1);
+    }
+
+    /* reserve one descriptor; drive_machine() gives it up temporarily when accept() hits EMFILE */
+    if ((stub_fd = open("/dev/null", O_RDONLY)) == -1) {
+        perror("open stub file failed");
+        exit(1);
+    }
+    thread_init(settings.num_threads);
+
+    /* create the listening socket, bind it, and init */
+    if (server_socket(settings.port, false)) {
+        fprintf(stderr, "failed to listen\n");
+        exit(EXIT_FAILURE);
+    }
+
+    /* register signal callback */
+    if (signal(SIGTERM, sig_handler) == SIG_ERR)
+        fprintf(stderr, "can not catch SIGTERM\n");
+    if (signal(SIGQUIT, sig_handler) == SIG_ERR)
+        fprintf(stderr, "can not catch SIGQUIT\n");
+    if (signal(SIGINT,  sig_handler) == SIG_ERR)
+        fprintf(stderr, "can not catch SIGINT\n");
+
+    /* enter the event loop */
+    printf("all ready.\n");
+    loop_run(settings.num_threads);
+
+    /* wait other thread to ends */
+    fprintf(stderr, "waiting for close ... \n");
+    cdb_destroy(db);
+    fprintf(stderr, "done.\n");
+
+    if (log_file) {
+        fclose(log_file);
+    }
+
+    return 0;
+}
+
diff --git a/libdap-cuttdb/src/cuttdb-server.h b/libdap-cuttdb/src/cuttdb-server.h
new file mode 100644
index 0000000000000000000000000000000000000000..90cc9b6271683c058dfacd007fad43ffef239c55
--- /dev/null
+++ b/libdap-cuttdb/src/cuttdb-server.h
@@ -0,0 +1,270 @@
+/*
+ *  Beansdb - A high available distributed key-value storage system:
+ *
+ *      http://beansdb.googlecode.com
+ *
+ *  The source code of Beansdb is most based on Memcachedb and Memcached:
+ *
+ *      http://memcachedb.org/
+ *      http://danga.com/memcached/
+ *
+ *  Copyright 2009 Douban Inc.  All rights reserved.
+ *
+ *  Use and distribution licensed under the BSD license.  See
+ *  the LICENSE file for full text.
+ *
+ *  Authors:
+ *      Davies Liu <davies.liu@gmail.com>
+ *
+ */
+ 
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <netinet/in.h>
+#include <netdb.h>
+
+#define DATA_BUFFER_SIZE 2048 /* read buffer size the server passes to conn_new() for accepted clients */
+#define MAX_PAYLOAD_SIZE 1400
+#define MAX_SENDBUF_SIZE (256 * 1024 * 1024)
+/* I'm told the max length of a 64-bit num converted to string is 20 bytes.
+ * Plus a few for spaces, \r\n, \0 */
+#define SUFFIX_SIZE 24
+#define INCR_MAX_STORAGE_LEN 24
+
+/** Initial size of list of items being returned by "get". */
+#define ITEM_LIST_INITIAL 200
+
+/** Initial size of the sendmsg() scatter/gather array. */
+#define IOV_LIST_INITIAL 400
+
+/** Initial number of sendmsg() argument structures to allocate. */
+#define MSG_LIST_INITIAL 10
+
+/** High water marks for buffer shrinking */
+#define READ_BUFFER_HIGHWAT 8192
+#define ITEM_LIST_HIGHWAT 400
+#define IOV_LIST_HIGHWAT 600
+#define MSG_LIST_HIGHWAT 100
+
+#define MAX_REP_PRIORITY 1000000
+#define MAX_REP_ACK_POLICY 6
+#define MAX_REP_NSITES 1000
+
+
+#define RGET_MAX_ITEMS 100
+#define PACKAGE "CuttDB" /* program name printed in the usage()/usage_license() banner */
+#define VERSION "0.1.0" /* version string printed right after PACKAGE */
+
+/* Get a consistent bool type */
+#include <stdbool.h>
+
+/* <stdint.h> is included unconditionally: this header uses uint8_t, uint32_t
+ * and uint64_t, while the former fallback for builds without HAVE_STDINT_H
+ * only declared uint8_t and left the wider types undefined (and could clash
+ * with a uint8_t already supplied by the system headers). */
+#include <stdint.h>
+
+/* unistd.h is here */
+#if HAVE_UNISTD_H
+# include <unistd.h>
+#endif
+
+/* 64-bit Portable printf */
+/* printf macros for size_t, in the style of inttypes.h */
+#ifdef _LP64
+#define __PRIS_PREFIX "z"
+#else
+#define __PRIS_PREFIX
+#endif
+
+#define AE_SETSIZE (1024*60)    /* Max number of fd supported */
+
+#define AE_OK 0
+#define AE_ERR -1
+/* Event mask values; the server passes AE_READABLE / AE_WRITABLE to update_event(). */
+#define AE_NONE 0
+#define AE_READABLE 1
+#define AE_WRITABLE 2
+
+/* Use these macros after a % in a printf format string
+   to get correct 32/64 bit behavior, like this:
+   size_t size = records.size();
+   printf("%"PRIuS"\n", size); */
+
+#define PRIdS __PRIS_PREFIX "d"
+#define PRIxS __PRIS_PREFIX "x"
+#define PRIuS __PRIS_PREFIX "u"
+#define PRIXS __PRIS_PREFIX "X"
+#define PRIoS __PRIS_PREFIX "o"
+
+struct stats { /* server counters; the visible server code updates them between STATS_LOCK() and STATS_UNLOCK() */
+    uint32_t      curr_conns;
+    uint32_t      total_conns;
+    uint32_t      conn_structs;
+    uint64_t      get_cmds;
+    uint64_t      set_cmds;
+    uint64_t      delete_cmds;
+    uint64_t      slow_cmds;
+    uint64_t      get_hits;
+    uint64_t      get_misses;
+    time_t        started;          /* when the process was started */
+    uint64_t      bytes_read;       /* grows by the result of each successful read() in drive_machine() */
+    uint64_t      bytes_written;
+};
+
+#define MAX_VERBOSITY_LEVEL 2
+
+struct settings {
+    size_t item_buf_size;   /* -b: main() rejects values below 512 and warns above 256 KB */
+    int maxconns;           /* -c: main() also uses it to raise RLIMIT_NOFILE */
+    int port;               /* -p: port handed to server_socket() */
+    char *inter;            /* -l: node name given to getaddrinfo() when binding */
+    int verbose;            /* incremented once per -v on the command line */
+    float slow_cmd_time;    /* -s is given in ms; main() stores it divided by 1000.0 */
+    int flush_period;
+    int flush_limit;
+    int num_threads;        /* -t: thread count main() passes to thread_init() and loop_run() */
+};
+
+extern struct stats stats;
+extern struct settings settings;
+
+typedef struct _stritem { /* fixed header of a stored item; key, suffix and data follow it in memory */
+    int             expire;        /* expire time */
+    uint32_t        flag;        /* flag of item */
+    int             nbytes;     /* size of data */
+    uint8_t         nsuffix;    /* length of flags-and-length string */
+    uint8_t         nkey;       /* key length, w/terminating null and padding */
+    void * end[];               /* flexible array member: the ITEM_* macros address the payload from here */
+    /* then null-terminated key */
+    /* then " flags length\r\n" (no terminating null) */
+    /* then data with terminating \r\n (no terminating null; it's binary!) */
+} item;
+
+#define ITEM_key(item) ((char*)&((item)->end[0]))
+/* NOTE(review): the macros below add 1 after nkey, so nkey seems to exclude the key's terminating null — confirm */
+/* warning: don't use these macros with a function, as it evals its arg twice */
+#define ITEM_suffix(item) ((char*) &((item)->end[0]) + (item)->nkey + 1)
+#define ITEM_data(item) ((char*) &((item)->end[0]) + (item)->nkey + 1 + (item)->nsuffix)
+#define ITEM_ntotal(item) (sizeof(struct _stritem) + (item)->nkey + 1 + (item)->nsuffix + (item)->nbytes)
+
+enum conn_states { /* values of conn.state; drive_machine() dispatches on them */
+    conn_listening,  /** the socket which listens for connections */
+    conn_read,       /** reading in a command line */
+    conn_write,      /** writing out a simple response */
+    conn_nread,      /** reading in a fixed number of bytes */
+    conn_swallow,    /** swallowing unnecessary bytes w/o storing */
+    conn_closing,    /** closing this connection */
+    conn_mwrite,     /** writing out many items sequentially */
+};
+/* Storage command codes; NOTE(review): presumably the values kept in conn.item_comm — confirm */
+#define NREAD_ADD 1
+#define NREAD_SET 2
+#define NREAD_REPLACE 3
+#define NREAD_APPEND 4
+#define NREAD_PREPEND 5
+
+typedef struct conn conn;
+struct conn {
+    int    sfd;       /* socket descriptor: accept()ed on while listening, read() from otherwise */
+    int    state;     /* current enum conn_states value; drive_machine() switches on it */
+    short  ev_flags;  /* NOTE(review): presumably the AE_* mask currently registered — confirm */
+
+    char   *rbuf;   /** buffer to read commands into */
+    char   *rcurr;  /** but if we parsed some already, this is where we stopped */
+    int    rsize;   /** total allocated size of rbuf */
+    int    rbytes;  /** how much data, starting from rcurr, do we have unparsed */
+
+    char   *wbuf;
+    char   *wcurr;    /* start of the pending simple response; conn_write hands it to add_iov() */
+    int    wsize;
+    int    wbytes;    /* number of bytes of that response, passed to add_iov() with wcurr */
+    int    write_and_go; /** which state to go into after finishing current write */
+    void   *write_and_free; /** free this memory after finishing writing */
+    bool   noreply;   /* True if the reply should not be sent. */
+
+    char   *ritem;  /** when we read in an item's value, it goes here */
+    int    rlbytes; /* bytes still missing from ritem; conn_nread completes when it reaches 0 */
+
+    /* data for the nread state */
+
+    /**
+     * item is used to hold an item structure created after reading the command
+     * line of set/add/replace commands, but before we finished reading the actual
+     * data. The data is read into ITEM_data(item) to avoid extra copying.
+     */
+
+    void   *item;     /* for commands set/add/replace  */
+    int    item_comm; /* which one is it: set/add/replace */
+
+    /* data for the swallow state */
+    int    sbytes;    /* how many bytes to swallow */
+
+    /* data for the mwrite state */
+    struct iovec *iov;
+    int    iovsize;   /* number of elements allocated in iov[] */
+    int    iovused;   /* number of elements used in iov[] */
+
+    struct msghdr *msglist;
+    int    msgsize;   /* number of elements allocated in msglist[] */
+    int    msgused;   /* number of elements used in msglist[] */
+    int    msgcurr;   /* element in msglist[] being transmitted now */
+    int    msgbytes;  /* number of bytes in current msg */
+
+    item   **ilist;   /* list of items to write out */
+    int    isize;
+    item   **icurr;   /* next item to release once an mwrite reply has been fully sent */
+    int    ileft;     /* how many items from icurr on are still to be released */
+
+    conn   *next;     /* Used for generating a list of conn structures */
+};
+
+/*
+ * Functions
+ */
+
+/* item management */
+/*
+void item_init(void);
+item *do_item_from_freelist(void);
+int do_item_add_to_freelist(item *it);
+item *item_alloc1(char *key, const size_t nkey, const int flags, const int nbytes);
+int item_free(item *it);
+item *item_get(char *key, size_t nkey);
+*/
+
+/* conn management */
+conn *do_conn_from_freelist(void);   /* (void) makes this a real prototype, like mt_conn_from_freelist(void) */
+bool do_conn_add_to_freelist(conn *c);
+conn *conn_new(const int sfd, const int init_state, const int read_buffer_size);
+
+int store_item(item *item, int comm);
+
+void thread_init(int nthreads);
+int add_event(int fd, int mask, conn *c);
+void loop_run(int nthreads);
+
+void drive_machine(conn *c);
+
+/* Lock wrappers for cache functions that are called from main loop. */
+conn *mt_conn_from_freelist(void);
+bool mt_conn_add_to_freelist(conn *c);
+item *mt_item_from_freelist(void);
+int mt_item_add_to_freelist(item *it);
+void  mt_stats_lock(void);
+void  mt_stats_unlock(void);
+
+#define conn_from_freelist()        mt_conn_from_freelist() /* the short names map onto the mt_* lock wrappers */
+#define conn_add_to_freelist(x)     mt_conn_add_to_freelist(x)
+#define item_from_freelist()        mt_item_from_freelist()
+#define item_add_to_freelist(x)     mt_item_add_to_freelist(x)
+#define STATS_LOCK()                mt_stats_lock()
+#define STATS_UNLOCK()              mt_stats_unlock()
+/* set to 1 by the server's signal handler on the first SIGTERM/SIGQUIT/SIGINT */
+extern int daemon_quit;
+
diff --git a/libdap-cuttdb/src/cuttdb.c b/libdap-cuttdb/src/cuttdb.c
new file mode 100644
index 0000000000000000000000000000000000000000..74e342623a5308fd4275868263eaac87c1c726e7
--- /dev/null
+++ b/libdap-cuttdb/src/cuttdb.c
@@ -0,0 +1,21 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#include "cuttdb.h"
+#include "cdb_types.h"
+#include "cdb_vio.h"
+
+
+/* nothing here */
diff --git a/cuttdb.h b/libdap-cuttdb/src/cuttdb.h
similarity index 100%
rename from cuttdb.h
rename to libdap-cuttdb/src/cuttdb.h
diff --git a/libdap-cuttdb/src/mman.c b/libdap-cuttdb/src/mman.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea5d358adc7768acd227507d7ac9e7853823729d
--- /dev/null
+++ b/libdap-cuttdb/src/mman.c
@@ -0,0 +1,172 @@
+/*
+ * mman-win32 library
+ * https://code.google.com/p/mman-win32/
+ * reinterpreted by Konstantin Papizh <konstantin.papizh@demlabs.net>
+ * DeM Labs Inc.   https://demlabs.net
+ */
+
+#include <windows.h>
+#include <errno.h>
+#include <stdio.h>
+#include "mman.h"
+
+/*
+ * Convert POSIX-style PROT_* bits into the page-protection constant that is
+ * handed to CreateFileMapping()/VirtualProtect().
+ *
+ * PROT_NONE yields 0; otherwise the result depends only on whether PROT_EXEC
+ * and PROT_WRITE are present (PROT_READ is implied by every value returned).
+ */
+static DWORD __map_mmap_prot_page(const int prot) {
+    const int writable = (prot & PROT_WRITE) != 0;
+
+    if (prot == PROT_NONE)
+        return 0;
+
+    if ((prot & PROT_EXEC) != 0)
+        return writable ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ;
+
+    return writable ? PAGE_READWRITE : PAGE_READONLY;
+}
+
+/*
+ * Convert POSIX-style PROT_* bits into the FILE_MAP_* access mask expected by
+ * MapViewOfFile(). Each PROT bit contributes its own flag; PROT_NONE (no bits
+ * set) therefore produces 0.
+ */
+static DWORD __map_mmap_prot_file(const int prot) {
+    DWORD access = 0;
+
+    access |= ((prot & PROT_READ) != 0) ? FILE_MAP_READ : 0;
+    access |= ((prot & PROT_WRITE) != 0) ? FILE_MAP_WRITE : 0;
+    access |= ((prot & PROT_EXEC) != 0) ? FILE_MAP_EXECUTE : 0;
+
+    return access;
+}
+
+/*
+ * mmap(2) emulation built on CreateFileMapping() + MapViewOfFile[Ex]().
+ *
+ * addr    placement address; only used when MAP_FIXED is set
+ * len     number of bytes to map, must be non-zero
+ * prot    PROT_* bits; PROT_EXEC on its own is rejected with EINVAL
+ * flags   MAP_* bits; only MAP_ANONYMOUS and MAP_FIXED are inspected here
+ * fildes  CRT file descriptor, ignored for MAP_ANONYMOUS
+ * off     offset of the view inside the file
+ *
+ * Returns the mapped address, or MAP_FAILED with errno set to EINVAL, EBADF,
+ * or the raw GetLastError() value when a Win32 call fails.
+ */
+void* mmap(void *addr, size_t len, int prot, int flags, int fildes, offset_t off)
+{
+    HANDLE fm, h;
+    void *map = MAP_FAILED;
+
+    /* the mapping object has to cover everything up to the end of the view */
+    const offset_t maxSize = off + (offset_t)len;
+
+    /*
+     * Widen to 64 bits before splitting into DWORD halves: this is correct for
+     * both the 32-bit and the 64-bit offset_t, and the high half is simply the
+     * value shifted down by 32 (no further masking of the upper bits).
+     */
+    const uint64_t off64 = (uint64_t)off;
+    const uint64_t maxSize64 = (uint64_t)maxSize;
+
+    const DWORD dwFileOffsetLow = (DWORD)(off64 & 0xFFFFFFFFu);
+    const DWORD dwFileOffsetHigh = (DWORD)(off64 >> 32);
+    const DWORD dwMaxSizeLow = (DWORD)(maxSize64 & 0xFFFFFFFFu);
+    const DWORD dwMaxSizeHigh = (DWORD)(maxSize64 >> 32);
+
+    const DWORD protect = __map_mmap_prot_page(prot);
+    const DWORD desiredAccess = __map_mmap_prot_file(prot);
+
+    _set_errno(0);
+
+    if (len == 0 || prot == PROT_EXEC) {
+        _set_errno(EINVAL);
+        return MAP_FAILED;
+    }
+
+    /* anonymous mappings are backed by the paging file, not by fildes */
+    h = ((flags & MAP_ANONYMOUS) == 0) ?
+                    (HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE;
+
+    if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE) {
+        _set_errno(EBADF);
+        return MAP_FAILED;
+    }
+
+    fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL);
+
+    if (fm == NULL) {
+        _set_errno(GetLastError());
+        return MAP_FAILED;
+    }
+
+    if ((flags & MAP_FIXED) == 0) {
+        map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len);
+    }
+    else {
+        map = MapViewOfFileEx(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len, addr);
+    }
+    /* the view keeps the mapping object alive, so the handle can go now */
+    CloseHandle(fm);
+
+    if (map == NULL) {
+        _set_errno(GetLastError());
+        return MAP_FAILED;
+    }
+    return map;
+}
+
+/*
+ * Release a view created by mmap(). len is accepted for signature
+ * compatibility only; the whole view starting at addr is unmapped.
+ * Returns 0 on success, -1 with errno set to GetLastError() otherwise.
+ */
+int munmap(void *addr, size_t len) {
+    if (!UnmapViewOfFile(addr)) {
+        _set_errno(GetLastError());
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Change the protection of [addr, addr + len) via VirtualProtect().
+ * Returns 0 on success, -1 with errno set to GetLastError() otherwise.
+ */
+int _mprotect(void *addr, size_t len, int prot) {
+    /* VirtualProtect() insists on an out-parameter for the old protection */
+    DWORD previous = 0;
+    const DWORD wanted = __map_mmap_prot_page(prot);
+
+    if (!VirtualProtect(addr, len, wanted, &previous)) {
+        _set_errno(GetLastError());
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Flush the mapped range with FlushViewOfFile(). The flags argument is not
+ * inspected. Returns 0 on success, -1 with errno set to GetLastError().
+ */
+int msync(void *addr, size_t len, int flags) {
+    if (!FlushViewOfFile(addr, len)) {
+        _set_errno(GetLastError());
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Lock the given range with VirtualLock().
+ * Returns 0 on success, -1 with errno set to GetLastError() otherwise.
+ */
+int mlock(const void *addr, size_t len) {
+    if (!VirtualLock((LPVOID)addr, len)) {
+        _set_errno(GetLastError());
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Undo mlock() for the given range with VirtualUnlock().
+ * Returns 0 on success, -1 with errno set to GetLastError() otherwise.
+ */
+int munlock(const void *addr, size_t len) {
+    if (!VirtualUnlock((LPVOID)addr, len)) {
+        _set_errno(GetLastError());
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Positional read: reads up to count bytes from fd at the given offset by
+ * passing the offset to ReadFile() through an OVERLAPPED structure.
+ *
+ * Returns the number of bytes read (0 when the read hit end of file), or -1
+ * with errno set to GetLastError() for any other failure.
+ */
+ssize_t pread(int fd, void *buf, unsigned long count, offset_t offset) {
+    unsigned long nread = 0;
+    OVERLAPPED where;
+    HANDLE file;
+
+    /* split the offset into the two 32-bit halves OVERLAPPED wants */
+    memset(&where, 0, sizeof(OVERLAPPED));
+    where.Offset = (uint32_t)(offset & 0xFFFFFFFFLL);
+    where.OffsetHigh = (uint32_t)((offset & 0xFFFFFFFF00000000LL) >> 32);
+
+    file = (HANDLE)_get_osfhandle(fd);
+    if (!ReadFile(file, buf, count, &nread, &where)) {
+        const DWORD err = GetLastError();
+        /* reading at or past EOF is reported as a short read, not an error */
+        if (err != ERROR_HANDLE_EOF) {
+            _set_errno(err);
+            return -1;
+        }
+    }
+    return nread;
+}
+
+/*
+ * Positional write: writes count bytes to fd at the given offset by passing
+ * the offset to WriteFile() through an OVERLAPPED structure.
+ *
+ * Returns the number of bytes written, or -1 with errno set to
+ * GetLastError() when WriteFile() fails.
+ */
+ssize_t pwrite(int fd, const void *buf, unsigned long count, offset_t offset) {
+    long unsigned int nwritten = 0;
+    OVERLAPPED where;
+    HANDLE file;
+
+    /* split the offset into the two 32-bit halves OVERLAPPED wants */
+    memset(&where, 0, sizeof(OVERLAPPED));
+    where.Offset = (uint32_t)(offset & 0xFFFFFFFFLL);
+    where.OffsetHigh = (uint32_t)((offset & 0xFFFFFFFF00000000LL) >> 32);
+
+    file = (HANDLE)_get_osfhandle(fd);
+    if (WriteFile(file, buf, count, &nwritten, &where))
+        return nwritten;
+
+    _set_errno(GetLastError());
+    return -1;
+}
diff --git a/libdap-cuttdb/src/mman.h b/libdap-cuttdb/src/mman.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8bb8cb78d0fdf69037399dd5d2845b857e08624
--- /dev/null
+++ b/libdap-cuttdb/src/mman.h
@@ -0,0 +1,59 @@
+/*
+ * Windows replacements for the POSIX memory-mapping and positional I/O calls
+ * (mmap, munmap, msync, mlock, munlock, pread, pwrite) plus the PROT_*, MAP_*
+ * and MS_* constants they take. The implementations live in mman.c.
+ */
+#ifndef _MMAN_H_
+#define _MMAN_H_
+
+/* 0x0600 selects the Windows Vista API level unless the build already chose one */
+#ifndef _WIN32_WINNT
+#define _WIN32_WINNT 0x0600
+#endif						
+
+#include <_mingw.h>
+#include <stdint.h>
+#include <io.h>
+
+/* File offset type: 64-bit signed on Win64, 32-bit unsigned otherwise. */
+#if defined(_WIN64)
+typedef int64_t offset_t;
+#else
+typedef uint32_t offset_t;
+#endif
+
+#include <sys/types.h>
+#include <stdbool.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* protection bits for mmap()/_mprotect() */
+#define PROT_NONE       0
+#define PROT_READ       1
+#define PROT_WRITE      2
+#define PROT_EXEC       4
+
+/* mapping flags for mmap(); mman.c only inspects MAP_ANONYMOUS and MAP_FIXED */
+#define MAP_FILE        0
+#define MAP_SHARED      1
+#define MAP_PRIVATE     2
+#define MAP_TYPE        0xf
+#define MAP_FIXED       0x10
+#define MAP_ANONYMOUS   0x20
+#define MAP_ANON        MAP_ANONYMOUS
+
+/* value returned by mmap() on failure */
+#define MAP_FAILED      ((void *)-1)
+
+/* flags for msync(); accepted but not inspected by the implementation in mman.c */
+#define MS_ASYNC        1
+#define MS_SYNC         2
+#define MS_INVALIDATE   4
+
+/* fdatasync() is provided through the CRT's _commit() */
+#define fdatasync(fd) _commit(fd)
+
+/* mmap() returns MAP_FAILED on error; the int-returning calls return 0 / -1 */
+void*   mmap(void *addr, size_t len, int prot, int flags, int fildes, offset_t offset);
+int     munmap(void *addr, size_t len);
+int     _mprotect(void *addr, size_t len, int prot);
+int     msync(void *addr, size_t len, int flags);
+int     mlock(const void *addr, size_t len);
+int     munlock(const void *addr, size_t len);
+
+/* positional read/write; return the byte count transferred or -1 */
+ssize_t pread(int fd, void *buf, unsigned long count, offset_t offset);
+ssize_t pwrite(int fd, const void *buf, unsigned long count, offset_t offset);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*  _MMAN_H_ */
diff --git a/libdap-cuttdb/src/server-thread.c b/libdap-cuttdb/src/server-thread.c
new file mode 100644
index 0000000000000000000000000000000000000000..c7a05c30319e63a177178ab43a84bc7b5435fb11
--- /dev/null
+++ b/libdap-cuttdb/src/server-thread.c
@@ -0,0 +1,217 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   The server&network part of CuttDB is based on Beansdb:
+ *
+ *   http://beansdb.googlecode.com
+ *
+ *   Beansdb is most based on Memcachedb and Memcached:
+ *
+ *   http://memcachedb.org/
+ *   http://danga.com/memcached/
+ *
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+#include "cuttdb-server.h"
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
+#include <pthread.h>
+
+/* Shared state of the single event loop that all worker threads poll. */
+typedef struct EventLoop {
+//    int   maxfd;
+    /* connection registered for each descriptor, indexed by fd (NULL = none) */
+    conn* conns[AE_SETSIZE];
+    /* descriptors reported ready by the last poll; consumed from the end */
+    int   fired[AE_SETSIZE];
+    /* number of entries in fired[] that have not been handed to a worker yet */
+    int   nready;
+    /* NOTE(review): presumably private state of the ae_* backend -- confirm */
+    void *apidata;
+} EventLoop;
+
+/* Lock for connection freelist */
+static pthread_mutex_t conn_lock;
+
+/* Lock for item buffer freelist */
+static pthread_mutex_t ibuffer_lock;
+
+static EventLoop loop;
+static pthread_mutex_t leader;
+
+/*
+ * Locked variant of do_conn_from_freelist(): takes conn_lock for the
+ * duration of the call and hands back whatever the unlocked function returned.
+ */
+conn *mt_conn_from_freelist() {
+    pthread_mutex_lock(&conn_lock);
+    conn *recycled = do_conn_from_freelist();
+    pthread_mutex_unlock(&conn_lock);
+
+    return recycled;
+}
+
+/*
+ * Locked variant of do_conn_add_to_freelist(): offers c back to the freelist
+ * while holding conn_lock.
+ *
+ * Returns the result of do_conn_add_to_freelist() unchanged.
+ */
+bool mt_conn_add_to_freelist(conn *c) {
+    pthread_mutex_lock(&conn_lock);
+    const bool outcome = do_conn_add_to_freelist(c);
+    pthread_mutex_unlock(&conn_lock);
+
+    return outcome;
+}
+
+
+/******************************* GLOBAL STATS ******************************/
+
+/* Intentionally empty: STATS_LOCK() resolves here, but no lock is taken. */
+void mt_stats_lock() {
+}
+
+/* Intentionally empty counterpart of mt_stats_lock(). */
+void mt_stats_unlock() {
+}
+
+/* Include the best multiplexing layer supported by this system.
+ * The following should be ordered by performances, descending. */
+#ifdef HAVE_EPOLL
+#include "ae_epoll.c"
+#else
+    #ifdef HAVE_KQUEUE
+    #include "ae_kqueue.c"
+    #else
+    #include "ae_select.c"
+    #endif
+#endif
+
+/*
+ * Prepares the threading layer: initialises the three module mutexes,
+ * clears the shared event loop and lets the multiplexing backend attach
+ * to it. The process exits with status 1 when the backend cannot be created.
+ *
+ * nthreads  accepted for interface compatibility; not used here
+ */
+void thread_init(int nthreads) {
+    /* wipe the loop first so the backend starts from all-zero state */
+    memset(&loop, 0, sizeof(loop));
+
+    pthread_mutex_init(&leader, NULL);
+    pthread_mutex_init(&conn_lock, NULL);
+    pthread_mutex_init(&ibuffer_lock, NULL);
+
+    if (aeApiCreate(&loop) != -1)
+        return;
+
+    exit(1);
+}
+
+/*
+ * Registers connection c for events `mask` on descriptor fd.
+ *
+ * The descriptor doubles as the index into loop.conns[], so it must lie in
+ * [0, AE_SETSIZE). The slot is expected to be free (asserted).
+ *
+ * Returns AE_OK on success, AE_ERR when fd is out of range or the backend
+ * refuses the registration (the slot is cleared again in that case).
+ */
+int add_event(int fd, int mask, conn *c)
+{
+    if (fd < 0) {
+        /* a negative index would write in front of loop.conns[] */
+        fprintf(stderr, "fd is negative: %d\n", fd);
+        return AE_ERR;
+    }
+    if (fd >= AE_SETSIZE) {
+        fprintf(stderr, "fd is too large: %d\n", fd);
+        return AE_ERR;
+    }
+    assert(loop.conns[fd] == NULL);
+    loop.conns[fd] = c;
+    if (aeApiAddEvent(&loop, fd, mask) == -1){
+        loop.conns[fd] = NULL;
+        return AE_ERR;
+    }
+    return AE_OK;
+}
+
+/*
+ * Re-registers connection c on descriptor fd with a new event mask.
+ *
+ * Returns AE_OK on success, AE_ERR when fd is outside [0, AE_SETSIZE) or the
+ * backend update fails (the slot is cleared in the latter case).
+ */
+int update_event(int fd, int mask, conn *c)
+{
+    /* fd indexes loop.conns[]; apply the same range guard as add_event() */
+    if (fd < 0 || fd >= AE_SETSIZE)
+        return AE_ERR;
+
+    loop.conns[fd] = c;
+    if (aeApiUpdateEvent(&loop, fd, mask) == -1){
+        loop.conns[fd] = NULL;
+        return AE_ERR;
+    }
+    return AE_OK;
+}
+
+/*
+ * Removes descriptor fd from the event loop.
+ *
+ * Returns 0 when the descriptor was removed or was not registered, -1 when
+ * fd is outside [0, AE_SETSIZE) or the backend fails to drop it (the slot is
+ * left untouched in that case).
+ */
+int delete_event(int fd)
+{
+    /* fd indexes loop.conns[], so reject negative values as well */
+    if (fd < 0 || fd >= AE_SETSIZE) return -1;
+    if (loop.conns[fd] == NULL) return 0;
+    if (aeApiDelEvent(&loop, fd) == -1)
+        return -1;
+    loop.conns[fd] = NULL;
+    return 0;
+}
+
+/*
+ * Body of every event-handling thread (also run directly by loop_run()).
+ *
+ * One thread at a time holds the `leader` mutex; while holding it the thread
+ * polls until at least one descriptor is ready, takes a single ready
+ * descriptor off loop.fired[], releases the mutex and then processes that
+ * connection outside the lock. The loop ends once daemon_quit is set.
+ *
+ * arg is unused; the return value is always NULL.
+ */
+static void *worker_main(void *arg) {
+    pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, 0);
+    
+    /* poll timeout of one second, so daemon_quit is re-checked regularly */
+    /* NOTE(review): tv is reused across polls; confirm the backend does not modify it */
+    struct timeval tv = {1, 0};
+    while (!daemon_quit) {
+        pthread_mutex_lock(&leader);
+
+AGAIN:
+        /* only poll again when the previous batch has been fully consumed */
+        while(loop.nready == 0 && daemon_quit == 0)
+            loop.nready = aeApiPoll(&loop, &tv);
+        if (daemon_quit) {
+            pthread_mutex_unlock(&leader);
+            break;
+        }
+       
+        /* take the last ready descriptor of the batch */
+        loop.nready --;
+        int fd = loop.fired[loop.nready];
+        conn *c = loop.conns[fd];
+        if (c == NULL){
+            /* ready descriptor without a connection: close it and pick the
+             * next one, still holding the leader mutex */
+            fprintf(stderr, "Bug: conn %d should not be NULL\n", fd);
+            close(fd);
+            goto AGAIN;
+        }
+        /* clear the slot before unlocking; update_event() below refills it */
+        loop.conns[fd] = NULL; 
+        pthread_mutex_unlock(&leader);
+        
+        drive_machine(c);
+        /* NOTE(review): update_event() writes loop.conns[] without holding
+         * `leader` -- confirm this is safe against the polling thread */
+        if (c->ev_flags > 0) {
+            update_event(fd, c->ev_flags, c);
+        }
+    }
+    return NULL; 
+}
+
+/*
+ * Runs the event loop on nthread threads: nthread - 1 workers are spawned and
+ * the calling thread acts as the last worker. Returns after worker_main()
+ * has finished on the calling thread and all spawned workers were joined.
+ *
+ * nthread  total number of event-handling threads; values below 2 run the
+ *          loop on the calling thread only
+ *
+ * The process exits with status 1 when the thread table cannot be allocated
+ * or a worker cannot be created.
+ */
+void loop_run(int nthread)
+{
+    int i, ret;
+    /* the calling thread is one of the workers, hence the "- 1" */
+    const int nworkers = (nthread > 1) ? nthread - 1 : 0;
+    pthread_t *tids = NULL;
+    pthread_attr_t  attr;
+
+    if (nworkers > 0) {
+        tids = malloc(sizeof(pthread_t) * (size_t)nworkers);
+        if (tids == NULL) {
+            fprintf(stderr, "Can't allocate thread table\n");
+            exit(1);
+        }
+    }
+
+    pthread_attr_init(&attr);
+    for (i = 0; i < nworkers; i++) {
+        if ((ret = pthread_create(tids + i, &attr, worker_main, NULL)) != 0) {
+            fprintf(stderr, "Can't create thread: %s\n",
+                    strerror(ret));
+            exit(1);
+        }
+    }
+    /* the attribute object is only needed while creating threads */
+    pthread_attr_destroy(&attr);
+
+    worker_main(NULL);
+
+    // wait workers to stop
+    for (i = 0; i < nworkers; i++) {
+        (void) pthread_join(tids[i], NULL);
+    }
+    free(tids);
+}
+
diff --git a/libdap-cuttdb/src/test_mt.c b/libdap-cuttdb/src/test_mt.c
new file mode 100644
index 0000000000000000000000000000000000000000..de4d383731a4a66bc2e690f40dfc742ec022073e
--- /dev/null
+++ b/libdap-cuttdb/src/test_mt.c
@@ -0,0 +1,149 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#include <pthread.h>
+#include "cuttdb.h"
+
+
+CDB *db;
+
+/* operations a test thread can issue */
+enum {
+    SETOP,
+    GETOP,
+    DELOP,
+};
+
+/*
+ * Operation mixes, indexed with a random value in 0..7. From table 1 to
+ * table 3 the share of deletions grows; main() switches between them
+ * depending on how full the database is. The #else branch (no deletions)
+ * is compiled out.
+ */
+#if 1
+static int prob_table1[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, DELOP, GETOP};
+static int prob_table2[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, DELOP, DELOP, GETOP};
+static int prob_table3[8] = {SETOP, SETOP, SETOP, DELOP, DELOP, DELOP, DELOP, GETOP};
+#else
+static int prob_table1[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, GETOP};
+static int prob_table2[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, GETOP};
+static int prob_table3[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, GETOP};
+#endif
+/* table currently in effect; written by main(), read by the test threads */
+int *optable = NULL;
+
+
+/*
+ * Combines two rand() results into one wider pseudo-random number.
+ * NOTE(review): the product can exceed the range of long where long is
+ * 32 bits and RAND_MAX is large -- confirm the targets this is built for.
+ */
+long get_rand() 
+{
+    return (long)rand() * RAND_MAX + rand();
+}
+
+
+/*
+ * Worker of the stress test: forever picks a random key number below *arg,
+ * derives key and value strings from it and issues the operation selected
+ * from the current optable (set, get or delete) against the global db.
+ *
+ * arg  points to an int holding the number of distinct keys
+ *
+ * Never returns; failures of the cdb calls are reported on stdout.
+ */
+void *test_thread(void *arg)
+{
+    char key[64];
+    char value[128];
+    int knum = *(int*)arg;
+    while(1) {
+        /* reset per iteration so a failed lookup never leaves a stale pointer */
+        void *v = NULL;
+        int krand = get_rand() % knum;
+        /* krand is an int, so every conversion is %d and matches one argument */
+        int ksize = snprintf(key, 64, "%d%d%d", krand, krand, krand);
+        int vsize = snprintf(value, 128, "%d%d%d%d%d%d%d%d",
+                krand, krand, krand, krand, krand, krand, krand, krand);
+        int op = optable[rand() & 0x07];
+        int expire = 600 + 20 * (rand() % 1000);
+        switch(op) {
+            case SETOP:
+                if (cdb_set2(db, key, ksize, value, vsize, CDB_OVERWRITE | CDB_INSERTCACHE, expire) < 0)
+                    printf("ERROR! %s:%d\n", __FILE__, __LINE__);
+                break;
+            case GETOP:
+                if (cdb_get(db, key, ksize, &v, &vsize) == -1)
+                    printf("ERROR! %s:%d\n", __FILE__, __LINE__);
+                if (v)
+                    cdb_free_val(&v);
+                break;
+            case DELOP:
+                if (cdb_del(db, key, ksize) == -1)
+                    printf("ERROR! %s:%d\n", __FILE__, __LINE__);
+                break;
+            default:
+                break;
+        }
+    }
+}
+
+
+
+/*
+ * Multi-threaded stress test driver.
+ *
+ * Usage: test_mt db_path [record_num] [thread_num]
+ *
+ * Opens (and truncates) the database at db_path, starts thread_num threads
+ * running test_thread() and then prints statistics once per second forever,
+ * switching the operation mix according to the current record count.
+ * Returns -1 when db_path is missing or the database cannot be opened.
+ */
+int main(int argc, char *argv[])
+{
+    int thread_num = 2;
+    int record_num = 10000000;
+    char *db_path = NULL;
+    /* the usage line is printed on every run, not only on bad arguments */
+    printf("Usage: %s db_path [record_num] [thread_num]\n", argv[0]);
+    if (argc >= 2)
+        db_path = argv[1];
+    else
+        return -1;
+
+    if (argc >= 3)
+        record_num = atoi(argv[2]);
+    if (argc >= 4)
+        thread_num = atoi(argv[3]);
+
+    /* clamp to sane minimums: at least 100 records and one thread */
+    record_num = record_num < 100? 100: record_num;
+    thread_num = thread_num < 1? 1: thread_num;
+    srand(time(NULL));
+
+    db = cdb_new();
+    cdb_option(db, record_num / 100, 0, 1024000);
+    if (cdb_open(db, db_path, CDB_CREAT | CDB_TRUNC) < 0) {
+        printf("DB Open err\n");
+        return -1;
+    }
+
+
+    /* start with the set-heavy mix */
+    optable = prob_table1;
+    pthread_t threads[thread_num];
+    for(int i = 0; i < thread_num; i++) {
+        pthread_create(&threads[i], NULL, test_thread, &record_num);
+    }
+
+    int clear_interval = 0;
+    while(1) {
+        CDBSTAT st;
+        cdb_stat(db, &st);
+        printf("rnum: %lu, rcnum: %lu, pnum: %lu, pcnum %lu, rlatcy: %u  wlatcy: %u"
+                " rh/m: %lu/%lu ph/m: %lu/%lu\n",
+                st.rnum, st.rcnum, st.pnum, st.pcnum, st.rlatcy, st.wlatcy,
+                st.rchit, st.rcmiss, st.pchit, st.pcmiss);
+        /* NOTE(review): presumably a NULL stat pointer resets the counters -- confirm */
+        if (++clear_interval % 20 == 0)
+            cdb_stat(db, NULL);
+
+        /* the checks run in this order every second; later ones override earlier ones */
+        if (st.rnum > 0.7 * record_num)
+            optable = prob_table2;
+        if (st.rnum > 0.9 * record_num)
+            optable = prob_table3;
+
+        if (st.rnum < 0.8 * record_num)
+            optable = prob_table2;
+
+        if (st.rnum < 0.6 * record_num)
+            optable = prob_table1;
+        fflush(stdout);
+        sleep(1);
+    }
+    
+    return 0;
+}
+
+
+
diff --git a/libdap-cuttdb/src/vio_apnd2.c b/libdap-cuttdb/src/vio_apnd2.c
new file mode 100644
index 0000000000000000000000000000000000000000..3f093a6fb55150cc0c7ac927f042a9cf0fc404aa
--- /dev/null
+++ b/libdap-cuttdb/src/vio_apnd2.c
@@ -0,0 +1,2647 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license.
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#include "vio_apnd2.h"
+#include "cdb_hashtable.h"
+#include "cdb_bgtask.h"
+#include "cdb_lock.h"
+#include "cuttdb.h"
+#include "cdb_core.h"
+#include "cdb_errno.h"
+#include "cdb_types.h"
+#include "cdb_crc64.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/stat.h>
+#ifdef _WIN32
+#include "mman.h"
+#else
+#include <sys/mman.h>
+#endif
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+
+/* record magic bytes */
+#define RECMAGIC 0x19871022
+/* obsoleted, but appeared in some code */
+#define DELRECMAGIC 0x19871023
+#define PAGEMAGIC 0x19890604
+
+/* data buffered before pwrite to disk */
+#define IOBUFSIZE (2 * MB)
+/* the deletion buffer is structured differently from the others; it holds at most DELBUFMAX records */
+#define DELBUFMAX 10000
+
+/* index(page) file size limit */
+#define FIDXMAXSIZE (16 * MB)
+/* data file size limit */
+#define FDATMAXSIZE (128 * MB)
+/* all meta information is stored in a fixed-size region */
+#define FILEMETASIZE 64
+/* limit on the number of simultaneously opened files, managed by LRU */
+#define MAXFD 16384
+#define MAX_PATH_LEN 255
+
+#define FILEMAGICHEADER "CuTtDbFiLePaRtIaL"
+#define FILEMAGICLEN (strlen(FILEMAGICHEADER))
+/* page or data records are stored at aligned offsets */
+#define ALIGNBYTES 16
+
+/*
+ * Virtual offset (48 bits: 32-bit i4 + 16-bit i2) -> real offset (fid, roff).
+ * i4 carries the file id in its upper 24 bits and bits 16..23 of the aligned
+ * offset in its low byte; i2 carries bits 0..15 of the aligned offset.
+ * fid and roff must be lvalues.
+ */
+#define VOFF2ROFF(off, fid, roff) do { \
+    (fid) = (off).i4 >> 8; \
+    (roff) = ((off).i4 & 0xff) << 16; \
+    (roff) = ((roff) | (off).i2) * ALIGNBYTES; \
+} while (0)
+
+/* real offset (fid, roff) -> virtual offset; the inverse of VOFF2ROFF */
+#define ROFF2VOFF(fid, roff, off) do { \
+    (off).i4 = (fid) << 8; \
+    (off).i4 |= ((roff) / ALIGNBYTES) >> 16; \
+    (off).i2 = ((roff) / ALIGNBYTES) & 0xffff; \
+} while (0)
+
+/* round an offset up to the next multiple of ALIGNBYTES */
+#define OFFALIGNED(off) ((((off)-1) | (ALIGNBYTES - 1)) + 1)
+
+/* keys of the fd LRU cache: even ids for index files, odd ids for data files */
+#define VFIDIDX(fid) ((fid) * 2)
+#define VFIDDAT(fid) ((fid) * 2 + 1)
+
+/* how often write out buffered data */
+#define FLUSHTIMEOUT 5
+/* how often to check if index file needs space recycle */
+#define RCYLEPAGEINTERVAL 60
+/* how often to check if data file needs space recycle */
+#define RCYLEDATAINTERVAL 120
+/* data file space recycle check interval factor (seconds per data file/128MB)*/
+#define DATARCYLECHECKFACTOR 1800
+
+
+/* the three types of file */
+enum {
+    /* arbitrary values */
+    VIOAPND2_INDEX = 0x97,
+    VIOAPND2_DATA = 0x98,
+    VIOAPND2_DELLOG = 0x99,
+};
+
+
+/* where the record comes from when calling writerec */
+enum {
+    VIOAPND2_RECEXTERNAL = 0,
+    VIOAPND2_RECINTERNAL = 1,
+};
+
+
+/* is a file still being written, or already full? */
+enum {
+    VIOAPND2_WRITING = 0,
+    VIOAPND2_FULL = 1,
+};
+
+/* signature in the header file: indicates whether the db is open or was safely closed */
+enum {
+    /* the exact numbers do not matter */
+    VIOAPND2_SIGOPEN = 2,
+    VIOAPND2_SIGCLOSED = 3,
+};
+
+
+/* buffer for IO: data is staged in buf before it is written to the file */
+typedef struct {
+    /* usable size of buf; set on open to min(IOBUFSIZE, file size limit - off) */
+    uint32_t limit;
+    /* NOTE(review): presumably the current end offset inside the file -- confirm */
+    uint32_t off;
+    /* NOTE(review): presumably the fill position inside buf; reset to 0 on open */
+    uint32_t pos;
+    /* id of the file being written; part of its name (idx%08d.cdb / dat%08d.cdb) */
+    uint32_t fid;
+    /* NOTE(review): presumably the operation id of the buffered content -- confirm */
+    uint64_t oid;
+    /* descriptor of the file being written */
+    int fd;
+    char buf[IOBUFSIZE];
+} VIOAPND2IOBUF;
+
+
+/* per-file information, kept for every index and data file; entries form a doubly linked list */
+typedef struct VIOAPND2FINFO {
+    /* fid */
+    uint32_t fid;
+    /* first oid */
+    uint64_t oidf;
+    /* last oid */
+    uint64_t oidl;
+
+    /* next file */
+    struct VIOAPND2FINFO *fnext;
+    /* prev file */
+    struct VIOAPND2FINFO *fprev;
+
+    /* file size -- NOTE(review): presumably in bytes, confirm */
+    uint32_t fsize;
+    /* junk space */
+    uint32_t rcyled;
+    /* nearest expire time */
+    uint32_t nexpire;
+    /* last time for recycle check */
+    uint32_t lcktime;
+    /* index page file or data file? */
+    uint8_t ftype;
+    /* writing or full? */
+    uint8_t fstatus;
+    /* ref count, avoid unlink failure */
+    uint32_t ref;
+    /* whether to unlink the file once it is no longer referenced */
+    bool unlink;
+} VIOAPND2FINFO;
+
+
+/* state of the append-only storage backend; one instance hangs off CDBVIO.iometa */
+typedef struct {
+    /* a new db? */
+    bool create;
+    /* fd number limit */
+    int maxfds;
+    /* opened files' fds cache */
+    CDBHASHTABLE *fdcache;
+
+    /* number of data file */
+    uint32_t dfnum;
+    /* number of index file */
+    uint32_t ifnum;
+
+    /* Buffers */
+    /* write buffer of the current data file */
+    VIOAPND2IOBUF dbuf;
+    /* write buffer of the current index file */
+    VIOAPND2IOBUF ibuf;
+    /* pending deletions; delbufpos is the number of used entries -- NOTE(review): confirm */
+    FOFF delbuf[DELBUFMAX];
+    int delbufpos;
+
+    /* db path */
+    char *filepath;
+
+
+    /* file information of index files */
+    CDBHASHTABLE *idxmeta;
+    VIOAPND2FINFO *idxfhead;
+    VIOAPND2FINFO *idxftail;
+    /* file information of data files */
+    CDBHASHTABLE *datmeta;
+    VIOAPND2FINFO *datfhead;
+    VIOAPND2FINFO *datftail;
+
+    /* fd for db header */
+    int hfd;
+    /* fd for files meta header */
+    int mfd;
+    /* fd for deletion log */
+    int dfd;
+
+    /* lock for all I/O operation */
+    CDBLOCK *lock;
+
+    /* NOTE(review): presumably the position (file id, offset, mapping) of an
+     * index iteration in progress -- confirm against the users of these fields */
+    int idxitfid;
+    uint32_t idxitoff;
+    char *idxmmap;
+
+} VIOAPND2;
+
+
+/* iterator over the records of the index or data files */
+typedef struct {
+    /* current open fd */
+    int fd;
+    /* current offset in file */
+    uint32_t off;
+    /* current operation id */
+    uint64_t oid;
+    /* current file size */
+    uint64_t fsize;
+    /* memory mapping of the current file */
+    char *mmap;
+    /* reference to the file's VIOAPND2FINFO */
+    VIOAPND2FINFO *finfo;
+} VIOAPND2ITOR;
+
+
+static int _vio_apnd2_open(CDBVIO *vio, const char *filepath, int flags);
+static int _vio_apnd2_checkpid(CDBVIO *vio);
+static int _vio_apnd2_write(CDBVIO *vio, int fd, void *buf, uint32_t size, bool aligned);
+static int _vio_apnd2_read(CDBVIO *vio, int fd, void *buf, uint32_t size, uint64_t off);
+static int _vio_apnd2_readmeta(CDBVIO *vio, bool overwrite);
+static int _vio_apnd2_writemeta(CDBVIO *vio);
+static int _vio_apnd2_close(CDBVIO *vio);
+static int _vio_apnd2_writerec(CDBVIO *vio, CDBREC *rec, FOFF *off, int ptrtype);
+static int _vio_apnd2_writerecexternal(CDBVIO *vio, CDBREC *rec, FOFF *off);
+static int _vio_apnd2_writerecinternal(CDBVIO *vio, CDBREC *rec, FOFF *off);
+static int _vio_apnd2_deleterec(CDBVIO *vio, CDBREC *rec, FOFF off);
+static int _vio_apnd2_readrec(CDBVIO *vio, CDBREC** rec, FOFF off, bool readval);
+static int _vio_apnd2_writepage(CDBVIO *vio, CDBPAGE *page, FOFF *off);
+static int _vio_apnd2_readpage(CDBVIO *vio, CDBPAGE **page, FOFF off);
+static int _vio_apnd2_sync(CDBVIO *vio);
+static int _vio_apnd2_writehead2(CDBVIO *vio);
+static int _vio_apnd2_writehead(CDBVIO *vio, bool wtable);
+static int _vio_apnd2_readhead2(CDBVIO *vio);
+static int _vio_apnd2_readhead(CDBVIO *vio, bool rtable);
+static int _vio_apnd2_writefmeta(CDBVIO *vio, int fd, VIOAPND2FINFO *finfo);
+static int _vio_apnd2_readfmeta(CDBVIO *vio, int fd, VIOAPND2FINFO *finfo);
+static int _vio_apnd2_flushbuf(CDBVIO *vio, int dtype);
+static void _vio_apnd2_flushtask(void *arg);
+static void _vio_apnd2_rcyledataspacetask(void *arg);
+static void _vio_apnd2_fixcachepageooff(CDB *db, uint32_t bit, FOFF off);
+static void _vio_apnd2_rcylepagespacetask(void *arg);
+static int _vio_apnd2_shiftnew(CDBVIO *vio, int dtype);
+static int _vio_apnd2_recovery(CDBVIO *vio, bool force);
+static void _vio_apnd2_unlink(CDBVIO *vio, VIOAPND2FINFO *finfo, int dtype);
+static VIOAPND2FINFO* _vio_apnd2_fileiternext(CDBVIO *vio, int dtype, uint64_t oid);
+static int _vio_apnd2_iterfirst(CDBVIO *vio, VIOAPND2ITOR *it, int dtype, int64_t oid);
+static int _vio_apnd2_iterfree(CDBVIO *vio, int dtype, VIOAPND2ITOR *it);
+static int _vio_apnd2_pageiternext(CDBVIO *vio, CDBPAGE **page, void *iter);
+static int _vio_apnd2_reciternext(CDBVIO *vio, CDBREC **rec, void *iter);
+static void* _vio_apnd2_reciterfirst(CDBVIO *vio, uint64_t oid);
+static void* _vio_apnd2_pageiterfirst(CDBVIO *vio, uint64_t oid);
+static void _vio_apnd2_reciterdestory(CDBVIO *vio, void *iter);
+static void _vio_apnd2_pageiterdestory(CDBVIO *vio, void *iter);
+static void _vio_apnd2_cleanpoint(CDBVIO *vio);
+static int _vio_apnd2_cmpfuncsreorder(const void *p1, const void *p2);
+static int _vio_apnd2_checkopensig(CDBVIO *vio);
+static int _vio_apnd2_setopensig(CDBVIO *vio, int sig);
+static int _vio_apnd2_rcyledatafile(CDBVIO *vio, VIOAPND2FINFO *finfo, bool rcyle);
+
+
+/*
+ * Install the append-only ("apnd2") backend into a CDBVIO method table.
+ * Every function pointer of the table is pointed at its _vio_apnd2_*
+ * implementation; nothing else in vio is touched.
+ */
+void vio_apnd2_init(CDBVIO *vio)
+{
+    /* database life cycle */
+    vio->open = _vio_apnd2_open;
+    vio->close = _vio_apnd2_close;
+    vio->sync = _vio_apnd2_sync;
+    vio->cleanpoint = _vio_apnd2_cleanpoint;
+
+    /* main header */
+    vio->rhead = _vio_apnd2_readhead2;
+    vio->whead = _vio_apnd2_writehead2;
+
+    /* index pages */
+    vio->rpage = _vio_apnd2_readpage;
+    vio->wpage = _vio_apnd2_writepage;
+    vio->pageitfirst = _vio_apnd2_pageiterfirst;
+    vio->pageitnext = _vio_apnd2_pageiternext;
+    vio->pageitdestroy = _vio_apnd2_pageiterdestory;
+
+    /* data records */
+    vio->rrec = _vio_apnd2_readrec;
+    vio->wrec = _vio_apnd2_writerecexternal;
+    vio->drec = _vio_apnd2_deleterec;
+    vio->recitfirst = _vio_apnd2_reciterfirst;
+    vio->recitnext = _vio_apnd2_reciternext;
+    vio->recitdestroy = _vio_apnd2_reciterdestory;
+}
+
+/*
+ * Identity "hash" for the VIOAPND2 tables: the key already is a 32-bit id,
+ * so it is returned as-is. size is ignored.
+ */
+static uint32_t _directhash(const void *key, int size)
+{
+    const uint32_t *id = (const uint32_t *)key;
+
+    (void)size;
+    return id[0];
+}
+
+
+/*
+ * Allocate a new VIOAPND2 object and attach it to vio->iometa; called when a
+ * db is opened. Every field starts from a defined value: counters and buffer
+ * positions at 0, descriptors at -1, pointers at NULL.
+ *
+ * When the allocation fails, vio->iometa is set to NULL.
+ */
+static void _vio_apnd2_new(CDBVIO *vio)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)malloc(sizeof(VIOAPND2));
+
+    if (myio == NULL) {
+        vio->iometa = NULL;
+        return;
+    }
+
+    myio->dfnum = 0;
+    myio->ifnum = 0;
+
+    /* data file buffer */
+    myio->dbuf.limit = 0;
+    myio->dbuf.fid = 0;
+    myio->dbuf.pos = 0;
+    myio->dbuf.off = 0;
+    myio->dbuf.oid = 0;
+    myio->dbuf.fd = -1;
+    memset(myio->dbuf.buf, 0, IOBUFSIZE);
+
+    /* index file buffer */
+    myio->ibuf.limit = 0;
+    myio->ibuf.fid = 0;
+    myio->ibuf.pos = 0;
+    myio->ibuf.off = 0;
+    myio->ibuf.oid = 0;
+    myio->ibuf.fd = -1;
+    memset(myio->ibuf.buf, 0, IOBUFSIZE);
+
+    myio->idxfhead = NULL;
+    myio->idxftail = NULL;
+    myio->datfhead = NULL;
+    myio->datftail = NULL;
+
+    /* only the first delbufpos entries of delbuf[] are meaningful */
+    myio->delbufpos = 0;
+
+    myio->mfd = -1;
+    myio->hfd = -1;
+    myio->dfd = -1;
+
+    myio->fdcache = cdb_ht_new(true, _directhash);
+    /* the following two are look-up tables and need no LRU */
+    myio->idxmeta = cdb_ht_new(false, _directhash);
+    myio->datmeta = cdb_ht_new(false, _directhash);
+
+    myio->lock = cdb_lock_new(CDB_LOCKMUTEX);
+
+    myio->create = true;
+    myio->maxfds = MAXFD;
+    myio->filepath = NULL;
+
+    /* zeroed so the object never carries indeterminate values */
+    myio->idxitfid = 0;
+    myio->idxitoff = 0;
+    myio->idxmmap = NULL;
+
+    vio->iometa = myio;
+}
+
+
+/*
+ * Tear down the VIOAPND2 object attached to vio (called when the db is
+ * closed): destroys the three hash tables and the lock, releases the stored
+ * path and the object itself, and clears vio->iometa.
+ */
+static void _vio_apnd2_destroy(CDBVIO *vio)
+{
+    VIOAPND2 *io = (VIOAPND2 *)vio->iometa;
+
+    cdb_ht_destroy(io->fdcache);
+    cdb_ht_destroy(io->idxmeta);
+    cdb_ht_destroy(io->datmeta);
+    cdb_lock_destory(io->lock);
+
+    /* free(NULL) is a no-op, so an unset path needs no guard */
+    free(io->filepath);
+    free(io);
+
+    vio->iometa = NULL;
+}
+
+/*
+ * Check whether another process already has the current db open, and claim
+ * it for this process otherwise.
+ *
+ * "<db path>/pid.cdb" holds the pid of the owning process. If the file exists
+ * and "/proc/<pid>" exists too, or its content cannot be parsed, the db counts
+ * as taken (CDB_PIDEXIST). In every other case the file is (re)written with
+ * the pid of the calling process.
+ *
+ * Returns 0 when the db was claimed, -1 with the db errno set otherwise.
+ */
+static int _vio_apnd2_checkpid(CDBVIO *vio)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    char pidfile[MAX_PATH_LEN] = {0};
+    char procdir[MAX_PATH_LEN] = {0};
+    struct stat sb;
+    FILE *fp;
+
+    snprintf(pidfile, MAX_PATH_LEN, "%s/pid.cdb", myio->filepath);
+
+    if (stat(pidfile, &sb) == 0) {
+        /* a pid file is present: find out whether its owner is still alive */
+        int owner = -1;
+        int nfields;
+
+        fp = fopen(pidfile, "rt");
+        if (fp == NULL) {
+            cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+            return -1;
+        }
+        nfields = fscanf(fp, "%d", &owner);
+        fclose(fp);
+
+        /* only look up the process when a pid was actually read */
+        if (nfields == 1)
+            snprintf(procdir, MAX_PATH_LEN, "/proc/%d", owner);
+
+        if (nfields != 1 || stat(procdir, &sb) == 0) {
+            cdb_seterrno(vio->db, CDB_PIDEXIST, __FILE__, __LINE__);
+            return -1;
+        }
+    }
+
+    /* no pid file, or its owner is gone: record ourselves as the owner */
+    fp = fopen(pidfile, "wt");
+    if (fp == NULL) {
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        return -1;
+    }
+    fprintf(fp, "%d\n", getpid());
+    fclose(fp);
+    return 0;
+}
+
+/*
+ * Open (or create) the database stored in directory `filepath`.
+ *
+ * flags   CDB_CREAT / CDB_TRUNC are translated to O_CREAT / O_TRUNC and used
+ *         for the header, meta, index and data files
+ *
+ * Steps: claim the pid file, open mainindex.cdb and read its open/closed
+ * signature, run recovery when a "force_recovery" marker file exists or the
+ * previous session was not closed, mark the db as open, load the file meta of
+ * an existing db (or start new index/data files), truncate the deletion log
+ * and finally register the background flush/recycle tasks.
+ *
+ * Returns 0 on success. On failure returns -1 with the db errno set; the
+ * descriptors opened here are closed and the VIOAPND2 object is destroyed.
+ */
+static int _vio_apnd2_open(CDBVIO *vio, const char *filepath, int flags)
+{
+    int rflags = O_RDWR;
+    char filename[MAX_PATH_LEN] = {0};
+    int fsize;
+    int sigstatus;
+    /* buffer-file descriptors opened directly below (existing db only) */
+    int ibuffd = -1;
+    int dbuffd = -1;
+    VIOAPND2 *myio;
+
+    _vio_apnd2_new(vio);
+    myio = (VIOAPND2 *)vio->iometa;
+    if (myio == NULL) {
+        /* nothing to clean up: the object was never created */
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    myio->filepath = strdup(filepath);
+    if (myio->filepath == NULL) {
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    if (flags & CDB_CREAT)
+        rflags |= O_CREAT;
+    if (flags & CDB_TRUNC)
+        rflags |= O_TRUNC;
+
+    if (_vio_apnd2_checkpid(vio) < 0) {
+        goto ERRRET;
+    }
+
+    snprintf(filename, MAX_PATH_LEN, "%s/mainindex.cdb", myio->filepath);
+    myio->hfd = open(filename, rflags, 0644);
+    if (myio->hfd < 0 && errno == ENOENT && (rflags & O_CREAT)) {
+        /* try to create, but path not exists */
+        cdb_seterrno(vio->db, CDB_DIRNOEXIST, __FILE__, __LINE__);
+        goto ERRRET;
+    } else if (myio->hfd < 0) {
+        /* other open error */
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    fsize = lseek(myio->hfd, 0, SEEK_END);
+    if (fsize) {
+        myio->create = false;
+        sigstatus = _vio_apnd2_checkopensig(vio);
+        if (sigstatus < 0) {
+            /* main table read error */
+            cdb_seterrno(vio->db, CDB_READERR, __FILE__, __LINE__);
+            goto ERRRET;
+        }
+    } else {
+        /* an empty header file has nothing to recover */
+        sigstatus = VIOAPND2_SIGCLOSED;
+    }
+
+    struct stat st;
+    snprintf(filename, MAX_PATH_LEN, "%s/force_recovery", myio->filepath);
+    if (stat(filename, &st) == 0) {
+        /* special file exist, force recovery to fix the database */
+        _vio_apnd2_recovery(vio, true);
+        unlink(filename);
+    }  else if (sigstatus == VIOAPND2_SIGOPEN) {
+        /* didn't properly closed last time */
+        _vio_apnd2_recovery(vio, false);
+    } else if (sigstatus != VIOAPND2_SIGCLOSED) {
+        cdb_seterrno(vio->db, CDB_DATAERRMETA, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    if (_vio_apnd2_setopensig(vio, VIOAPND2_SIGOPEN) < 0) {
+        cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    snprintf(filename, MAX_PATH_LEN, "%s/mainmeta.cdb", myio->filepath);
+    myio->mfd = open(filename, rflags, 0644);
+    if (myio->mfd < 0) {
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    fsize = lseek(myio->mfd, 0, SEEK_END);
+    if (fsize) {
+        /* exist database */
+        _vio_apnd2_readmeta(vio, false);
+
+        /* open current data file and index file for buffer */
+        snprintf(filename, MAX_PATH_LEN, "%s/idx%08d.cdb", myio->filepath, myio->ibuf.fid);
+        ibuffd = open(filename, rflags, 0644);
+        myio->ibuf.fd = ibuffd;
+        if (ibuffd < 0) {
+            /* without the current index file no page could be written later */
+            cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+            goto ERRRET;
+        }
+        myio->ibuf.limit = CDBMIN(IOBUFSIZE, FIDXMAXSIZE - myio->ibuf.off);
+        myio->ibuf.pos = 0;
+
+        snprintf(filename, MAX_PATH_LEN, "%s/dat%08d.cdb", myio->filepath, myio->dbuf.fid);
+        dbuffd = open(filename, rflags, 0644);
+        myio->dbuf.fd = dbuffd;
+        if (dbuffd < 0) {
+            cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+            goto ERRRET;
+        }
+        myio->dbuf.limit = CDBMIN(IOBUFSIZE, FDATMAXSIZE - myio->dbuf.off);
+        myio->dbuf.pos = 0;
+    } else {
+        /* new database */
+        myio->create = true;
+        /* remember the bnum */
+        _vio_apnd2_writehead(vio, false);
+        _vio_apnd2_shiftnew(vio, VIOAPND2_INDEX);
+        _vio_apnd2_shiftnew(vio, VIOAPND2_DATA);
+    }
+
+    /* the deletion log always starts empty */
+    snprintf(filename, MAX_PATH_LEN, "%s/dellog.cdb", myio->filepath);
+    myio->dfd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644);
+    if (myio->dfd < 0) {
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    /* set background tasks, flush buffer and recycle space */
+    cdb_bgtask_add(vio->db->bgtask, _vio_apnd2_flushtask, vio, FLUSHTIMEOUT);
+    cdb_bgtask_add(vio->db->bgtask, _vio_apnd2_rcylepagespacetask, vio, RCYLEPAGEINTERVAL);
+    cdb_bgtask_add(vio->db->bgtask, _vio_apnd2_rcyledataspacetask, vio, RCYLEDATAINTERVAL);
+    return 0;
+
+ERRRET:
+    /* unopened descriptors are -1 and 0 is a valid descriptor, hence ">= 0" */
+    if (ibuffd >= 0)
+        close(ibuffd);
+    if (dbuffd >= 0)
+        close(dbuffd);
+    if (myio->mfd >= 0)
+        close(myio->mfd);
+    if (myio->hfd >= 0)
+        close(myio->hfd);
+    if (myio->dfd >= 0)
+        close(myio->dfd);
+    _vio_apnd2_destroy(vio);
+    return -1;
+}
+
+
+/* background task: flush the data, index and deletion-log buffers while holding the i/o lock */
+static void _vio_apnd2_flushtask(void *arg)
+{
+    CDBVIO *owner = (CDBVIO *)arg;
+    VIOAPND2 *io = (VIOAPND2 *)owner->iometa;
+    const int order[] = { VIOAPND2_DATA, VIOAPND2_INDEX, VIOAPND2_DELLOG };
+    unsigned int k;
+
+    cdb_lock_lock(io->lock);
+    /* data first, then index, then the deletion log; results are ignored */
+    for (k = 0; k < sizeof(order) / sizeof(order[0]); k++)
+        _vio_apnd2_flushbuf(owner, order[k]);
+    cdb_lock_unlock(io->lock);
+}
+
+
+/*
+ * Load the global meta file (myio->mfd) into memory.
+ *
+ * The file starts with a FILEMETASIZE header (magic bytes followed by eight
+ * 32-bit fields), then one fixed-size entry per index file and one per data file.
+ *
+ * overwrite == false: every header field is taken from disk and each entry is
+ *     inserted into idxmeta/datmeta and appended to the matching file list.
+ * overwrite == true (recovery): only the two buffer limits are taken from the
+ *     header; index entries are ignored, and for data files already present in
+ *     datmeta only 'rcyled' and 'nexpire' are refreshed. A short read returns 0.
+ *
+ * Returns 0 on success, -1 on error (the error code is set with cdb_seterrno).
+ */
+static int _vio_apnd2_readmeta(CDBVIO *vio, bool overwrite)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    char buf[FILEMETASIZE];
+    char *hbuf;
+    int hbufsize;
+    int pos = 0;
+
+    /* fixed-size header at the start of the meta file */
+    if (pread(myio->mfd, buf, FILEMETASIZE, 0) != FILEMETASIZE) {
+        if (overwrite)
+            return 0;
+        cdb_seterrno(vio->db, CDB_READERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    if (memcmp(buf, FILEMAGICHEADER, FILEMAGICLEN) != 0) {
+        cdb_seterrno(vio->db, CDB_DATAERRMETA, __FILE__, __LINE__);
+        return -1;
+    }
+
+    /* header fields follow the magic in the same order _vio_apnd2_writemeta stores them */
+    pos += FILEMAGICLEN;
+    cdb_lock_lock(myio->lock);
+    if (!overwrite)
+        myio->ibuf.off = *(uint32_t*)(buf + pos);
+    pos += SI4;
+    myio->ibuf.limit = *(uint32_t*)(buf + pos);
+    pos += SI4;
+    if (!overwrite)
+        myio->dbuf.off = *(uint32_t*)(buf + pos);
+    pos += SI4;
+    myio->dbuf.limit = *(uint32_t*)(buf + pos);
+    pos += SI4;
+    if (!overwrite)
+        myio->ifnum = *(uint32_t*)(buf + pos);
+    pos += SI4;
+    if (!overwrite)
+        myio->dfnum = *(uint32_t*)(buf + pos);
+    pos += SI4;
+    if (!overwrite)
+        myio->ibuf.fid = *(uint32_t*)(buf + pos);
+    pos += SI4;
+    if (!overwrite)
+        myio->dbuf.fid = *(uint32_t*)(buf + pos);
+    pos += SI4;
+
+    /* per-file entries: index entries have no 'nexpire' field, data entries do */
+    hbufsize = (SI4 + SI4 + SI4 + SI8 + SI8 + 1 + 1) * myio->ifnum;
+    hbufsize += (SI4 + SI4 + SI4 + SI4 + SI8 + SI8 + 1 + 1) * myio->dfnum;
+    /* NOTE(review): the malloc result is not checked before use */
+    hbuf = (char*)malloc(hbufsize);
+    pos = 0;
+
+    if (pread(myio->mfd, hbuf, hbufsize, FILEMETASIZE) != hbufsize) {
+        cdb_lock_unlock(myio->lock);
+        free(hbuf);
+        if (overwrite)
+            return 0;
+        cdb_seterrno(vio->db, CDB_READERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    /* index file entries */
+    for(int i = 0; i < myio->ifnum; i++) {
+        VIOAPND2FINFO finfo, *finfo2;
+        finfo.fid = *(uint32_t*)(hbuf + pos);
+        pos += SI4;
+        finfo.fsize = *(uint32_t*)(hbuf + pos);
+        pos += SI4;
+        finfo.rcyled = *(uint32_t*)(hbuf + pos);
+        pos += SI4;;
+        finfo.oidf = *(uint64_t*)(hbuf + pos);
+        pos += SI8;
+        finfo.oidl = *(uint64_t*)(hbuf + pos);
+        pos += SI8;
+        finfo.fstatus = *(uint8_t*)(hbuf + pos);
+        pos += 1;
+        finfo.ftype = *(uint8_t*)(hbuf + pos);
+        pos += 1;
+        finfo.ref = 0;
+        finfo.unlink = false;
+        if (overwrite) {
+            /* in recovery mode only fix 'recycled size' */
+            /* But do nothing with index files */
+            continue;
+        }
+        /* store a copy in the table and append it to the tail of the index file list */
+        finfo2 = (VIOAPND2FINFO *)cdb_ht_insert2(myio->idxmeta, &finfo.fid, SI4, &finfo, sizeof(finfo));
+        if (myio->idxfhead) {
+            finfo2->fprev = myio->idxftail;
+            myio->idxftail->fnext = finfo2;
+            finfo2->fnext = NULL;
+            myio->idxftail = finfo2;
+        } else {
+            myio->idxfhead = myio->idxftail = finfo2;
+            finfo2->fprev = finfo2->fnext = NULL;
+        }
+    }
+
+    /* data file entries */
+    for(int i = 0; i < myio->dfnum; i++) {
+        VIOAPND2FINFO finfo, *finfo2;
+        finfo.fid = *(uint32_t*)(hbuf + pos);
+        pos += SI4;
+        finfo.fsize = *(uint32_t*)(hbuf + pos);
+        pos += SI4;
+        finfo.rcyled = *(uint32_t*)(hbuf + pos);
+        pos += SI4;
+        finfo.nexpire = *(uint32_t*)(hbuf + pos);
+        pos += SI4;
+        finfo.oidf = *(uint64_t*)(hbuf + pos);
+        pos += SI8;
+        finfo.oidl = *(uint64_t*)(hbuf + pos);
+        pos += SI8;
+        finfo.fstatus = *(uint8_t*)(hbuf + pos);
+        pos += 1;
+        finfo.ftype = *(uint8_t*)(hbuf + pos);
+        pos += 1;
+        finfo.ref = 0;
+        finfo.unlink = false;
+        finfo.lcktime = time(NULL);
+        if (overwrite) {
+            /* in recovery mode only refresh 'rcyled' and 'nexpire' of files already known */
+            finfo2 = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &finfo.fid, SI4, false);
+            if (finfo2) {
+                finfo2->rcyled = finfo.rcyled;
+                finfo2->nexpire = finfo.nexpire;
+            }
+            continue;
+        }
+        /* store a copy in the table and append it to the tail of the data file list */
+        finfo2 = (VIOAPND2FINFO *)cdb_ht_insert2(myio->datmeta, &finfo.fid, SI4, &finfo, sizeof(finfo));
+        if (myio->datfhead) {
+            finfo2->fprev = myio->datftail;
+            myio->datftail->fnext = finfo2;
+            finfo2->fnext = NULL;
+            myio->datftail = finfo2;
+        } else {
+            myio->datfhead = myio->datftail = finfo2;
+            finfo2->fprev = finfo2->fnext = NULL;
+        }
+    }
+    cdb_lock_unlock(myio->lock);
+    free(hbuf);
+
+    return 0;
+}
+
+
+/*
+ * Flush one of the i/o buffers, selected by 'dtype'.
+ *
+ * VIOAPND2_DELLOG: write the pending deletion offsets (delbuf) to myio->dfd and
+ *     reset delbufpos. On a write error delbufpos is left unchanged.
+ * VIOAPND2_INDEX / VIOAPND2_DATA: write the buffered bytes at iobuf->off, record
+ *     the last operation id and the file size in the file's meta entry, then
+ *     either recompute the buffer limit or, when the file is within 16 KB of its
+ *     maximum size, mark it full and switch to a new file.
+ *
+ * Returns 0 on success, -1 on failure. This function does not take myio->lock itself.
+ */
+static int _vio_apnd2_flushbuf(CDBVIO *vio, int dtype)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    VIOAPND2FINFO *finfo;
+    VIOAPND2IOBUF *iobuf;
+    CDBHASHTABLE *ht;
+    uint32_t *fid;
+    uint32_t fsizemax;
+
+    /* link to the proper operation object */
+    if (dtype == VIOAPND2_INDEX) {
+        iobuf = &myio->ibuf;
+        ht = myio->idxmeta;
+        fsizemax = FIDXMAXSIZE;
+    } else if (dtype == VIOAPND2_DATA) {
+        iobuf = &myio->dbuf;
+        ht = myio->datmeta;
+        fsizemax = FDATMAXSIZE;
+    } else if (dtype == VIOAPND2_DELLOG) {
+        /* the deletion buffer is a plain array of FOFF, handled separately */
+        if (myio->delbufpos == 0)
+            return 0;
+        if (write(myio->dfd, myio->delbuf, sizeof(FOFF) * myio->delbufpos)
+                != sizeof(FOFF) * myio->delbufpos) {
+            cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__);
+            return -1;
+        }
+        myio->delbufpos = 0;
+        return 0;
+    } else {
+        cdb_seterrno(vio->db, CDB_INTERNALERR, __FILE__, __LINE__);
+        return -1;
+    }
+    fid = &iobuf->fid;
+
+    /* look up the meta entry of the file currently being written */
+    finfo = (VIOAPND2FINFO *)cdb_ht_get2(ht, fid, SI4, false);
+    if (finfo == NULL) {
+        cdb_seterrno(vio->db, CDB_INTERNALERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    /* write out if buffered */
+    if (iobuf->pos > 0) {
+        if (pwrite(iobuf->fd, iobuf->buf, iobuf->pos, iobuf->off) != iobuf->pos) {
+            /* cut the file back to where the buffer started; the empty 'if' only */
+            /* silences the unused-result compile warning */
+            if (ftruncate(iobuf->fd, iobuf->off) < 0) ;
+            cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__);
+            return -1;
+        }
+    }
+
+    /* mark the operation id */
+    finfo->oidl = iobuf->oid;
+
+    /* reset the buffer information */
+    iobuf->pos = 0;
+    iobuf->off = lseek(iobuf->fd, 0, SEEK_END);
+    /* fix file size info whenever possible */
+    finfo->fsize = iobuf->off;
+    /* the next buffered write starts at the aligned end of the file */
+    iobuf->off = OFFALIGNED(iobuf->off);
+
+    /* current writing file nearly full? open a new one */
+    if (iobuf->off > fsizemax - 16 * KB) {
+        finfo->fstatus = VIOAPND2_FULL;
+        /* results of the header rewrite and of opening the new file are not checked here */
+        _vio_apnd2_writefmeta(vio, iobuf->fd, finfo);
+        close(iobuf->fd);
+        _vio_apnd2_shiftnew(vio, dtype);
+    } else
+        iobuf->limit = CDBMIN(IOBUFSIZE, fsizemax - iobuf->off) ;
+
+    return 0;
+}
+
+/*
+ * Create a new index or data file (selected by 'dtype') and point the matching
+ * i/o buffer at it.
+ *
+ * The buffer is first invalidated, then an unused file id is searched starting
+ * from the current one. The new file is created/truncated, its header is written,
+ * the file counter is incremented and a meta entry is inserted into the hash table
+ * and appended to the tail of the file list.
+ *
+ * Returns 0 on success, -1 on failure. On failure the buffer stays invalidated
+ * (fd == -1), except when open() itself fails, where fd holds open()'s negative result.
+ */
+static int _vio_apnd2_shiftnew(CDBVIO *vio, int dtype)
+{
+    VIOAPND2 *myio = (VIOAPND2*)vio->iometa;
+    VIOAPND2IOBUF *iobuf;
+    CDBHASHTABLE *ht;
+    uint32_t *fnum;
+    uint32_t tryiter, curfid;
+    char filename[MAX_PATH_LEN];
+    char ipfx[] = "idx";
+    char dpfx[] = "dat";
+    char *pfx;
+
+    /* link to proper object by dtype */
+    if (dtype == VIOAPND2_INDEX) {
+        iobuf = &myio->ibuf;
+        ht = myio->idxmeta;
+        fnum = &myio->ifnum;
+        pfx = ipfx;
+    } else if (dtype == VIOAPND2_DATA) {
+        iobuf = &myio->dbuf;
+        ht = myio->datmeta;
+        fnum = &myio->dfnum;
+        pfx = dpfx;
+    } else {
+        cdb_seterrno(vio->db, CDB_INTERNALERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    /* start the search for a free id at the id of the file written so far */
+    curfid = iobuf->fid;
+
+    /* invalidate the buffer so it cannot be misused until the new file is ready */
+    iobuf->fd = -1;
+    iobuf->fid = 0xffffff;
+    iobuf->limit = iobuf->pos = iobuf->off = 0xffffffff;
+
+    /* find an unused fid; give up after 0xffffff attempts, wrapping to 0 at 0xffffff */
+    tryiter = 0;
+    while(cdb_ht_exist(ht, &curfid, SI4)) {
+        curfid++;
+        tryiter++;
+        if (tryiter == 0xffffff) {
+            cdb_seterrno(vio->db, CDB_NOFID, __FILE__, __LINE__);
+            return -1;
+        }
+        if (curfid == 0xffffff)
+            curfid = 0;
+    }
+
+    /* open new file */
+    snprintf(filename, MAX_PATH_LEN, "%s/%s%08d.cdb", myio->filepath, pfx, curfid);
+    iobuf->fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644);
+    if (iobuf->fd < 0) {
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        return -1;
+    }
+    /* buffered data starts right after the FILEMETASIZE header */
+    iobuf->limit = IOBUFSIZE;
+    iobuf->fid = curfid;
+    iobuf->off = FILEMETASIZE;
+    iobuf->pos = 0;
+
+    /* set meta information for new file */
+    VIOAPND2FINFO finfo, *finfo2;
+    finfo.fsize = lseek(iobuf->fd, 0, SEEK_END);
+    finfo.oidf = iobuf->oid;
+    finfo.oidl = iobuf->oid;
+    finfo.rcyled = 0;
+    finfo.lcktime = time(NULL);
+    finfo.fstatus = VIOAPND2_WRITING;
+    finfo.ftype = dtype;
+    finfo.fid = curfid;
+    finfo.unlink = false;
+    finfo.nexpire = 0xffffffff;
+    finfo.ref = 0;
+    /* the file header is written to disk immediately; undo the buffer setup on failure */
+    if (_vio_apnd2_writefmeta(vio, iobuf->fd, &finfo) < 0) {
+        close(iobuf->fd);
+        iobuf->fd = -1;
+        iobuf->fid = 0xffffff;
+        iobuf->limit = iobuf->pos = iobuf->off = 0xffffffff;
+        return -1;
+    }
+    (*fnum)++;
+    /* register the file and append its entry to the tail of the matching list */
+    finfo2 = cdb_ht_insert2(ht, &curfid, SI4, &finfo, sizeof(VIOAPND2FINFO));
+    if (dtype == VIOAPND2_INDEX) {
+        if (myio->idxfhead) {
+            finfo2->fprev = myio->idxftail;
+            myio->idxftail->fnext = finfo2;
+            finfo2->fnext = NULL;
+            myio->idxftail = finfo2;
+        } else {
+            myio->idxfhead = myio->idxftail = finfo2;
+            finfo2->fprev = finfo2->fnext = NULL;
+        }
+    } else if (dtype == VIOAPND2_DATA) {
+        if (myio->datfhead) {
+            finfo2->fprev = myio->datftail;
+            myio->datftail->fnext = finfo2;
+            finfo2->fnext = NULL;
+            myio->datftail = finfo2;
+        } else {
+            myio->datfhead = myio->datftail = finfo2;
+            finfo2->fprev = finfo2->fnext = NULL;
+        }
+    }
+
+    return 0;
+}
+
+
+/*
+ * Store the header of a single index/data file at offset 0 of 'fd'.
+ * Layout: magic, oidf, oidl, fsize, fid, fstatus, ftype; the rest of the
+ * FILEMETASIZE bytes is padded with 'X'. Returns 0 on success, -1 on write error.
+ */
+static int _vio_apnd2_writefmeta(CDBVIO *vio, int fd, VIOAPND2FINFO *finfo)
+{
+    char header[FILEMETASIZE];
+    char *cur = header;
+
+    /* padding first, then overlay the magic and the fields */
+    memset(header, 'X', FILEMETASIZE);
+    memcpy(cur, FILEMAGICHEADER, FILEMAGICLEN);
+    cur += FILEMAGICLEN;
+
+    *(uint64_t*)cur = finfo->oidf;
+    cur += SI8;
+    *(uint64_t*)cur = finfo->oidl;
+    cur += SI8;
+    *(uint32_t*)cur = finfo->fsize;
+    cur += SI4;
+    *(uint32_t*)cur = finfo->fid;
+    cur += SI4;
+    *(uint8_t*)cur = finfo->fstatus;
+    cur += 1;
+    *(uint8_t*)cur = finfo->ftype;
+
+    if (pwrite(fd, header, FILEMETASIZE, 0) == FILEMETASIZE)
+        return 0;
+
+    cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__);
+    return -1;
+}
+
+/*
+ * Load the header of a single index/data file from offset 0 of 'fd' into 'finfo'.
+ * Only oidf, oidl, fsize, fid, fstatus and ftype are filled in, and only after the
+ * magic has been verified. Returns 0 on success, -1 on read error or bad magic.
+ */
+static int _vio_apnd2_readfmeta(CDBVIO *vio, int fd, VIOAPND2FINFO *finfo)
+{
+    char header[FILEMETASIZE];
+    const char *cur;
+
+    memset(header, 'X', FILEMETASIZE);
+    if (pread(fd, header, FILEMETASIZE, 0) != FILEMETASIZE) {
+        cdb_seterrno(vio->db, CDB_READERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    if (memcmp(header, FILEMAGICHEADER, FILEMAGICLEN) != 0) {
+        cdb_seterrno(vio->db, CDB_DATAERRMETA, __FILE__, __LINE__);
+        return -1;
+    }
+
+    /* fields are laid out back to back right after the magic */
+    cur = header + FILEMAGICLEN;
+    finfo->oidf = *(const uint64_t*)cur;
+    cur += SI8;
+    finfo->oidl = *(const uint64_t*)cur;
+    cur += SI8;
+    finfo->fsize = *(const uint32_t*)cur;
+    cur += SI4;
+    finfo->fid = *(const uint32_t*)cur;
+    cur += SI4;
+    finfo->fstatus = *(const uint8_t*)cur;
+    cur += 1;
+    finfo->ftype = *(const uint8_t*)cur;
+    return 0;
+}
+
+
+/*
+ * Append 'size' bytes from 'buf' to the end of file 'fd', bypassing the i/o buffer.
+ *
+ * When 'aligned' is true the write starts at the aligned end of the file
+ * (OFFALIGNED), otherwise exactly at the end. If the write fails or is short, the
+ * file that was written to is truncated back to the start offset so no partial
+ * data is left behind.
+ *
+ * Returns 'size' on success, 0 when size is 0, -1 on failure (CDB_WRITEERR is set).
+ */
+static int _vio_apnd2_write(CDBVIO *vio, int fd, void *buf, uint32_t size, bool aligned)
+{
+    off_t end;
+    uint32_t off;
+
+    if (size == 0)
+        return 0;
+
+    end = lseek(fd, 0, SEEK_END);
+    if (end < 0) {
+        /* cannot determine where to append (e.g. invalid descriptor) */
+        cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__);
+        return -1;
+    }
+    off = end;
+    if (aligned)
+        off = OFFALIGNED(off);
+    if (pwrite(fd, buf, size, off) != (ssize_t)size) {
+        /* roll back on the descriptor actually written to, not on the index buffer's fd */
+        /* the empty 'if' only silences the unused-result compile warning */
+        if (ftruncate(fd, off) < 0) ;
+        cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    return size;
+}
+
+
+/*
+ * Read up to 'size' bytes located at file offset 'off' of 'fd'.
+ * If 'fd' is the file behind the data or index buffer and 'off' lies at or beyond
+ * that buffer's start offset, the bytes are copied from the in-memory buffer;
+ * otherwise they are read from disk. Returns the byte count, or -1 on read error.
+ */
+static int _vio_apnd2_read(CDBVIO *vio, int fd, void *buf, uint32_t size, uint64_t off)
+{
+    VIOAPND2 *io = (VIOAPND2 *)vio->iometa;
+    VIOAPND2IOBUF *hit = NULL;
+    int nread;
+
+    /* data buffer is checked before the index buffer */
+    if (fd == io->dbuf.fd && off >= io->dbuf.off)
+        hit = &io->dbuf;
+    else if (fd == io->ibuf.fd && off >= io->ibuf.off)
+        hit = &io->ibuf;
+
+    if (hit == NULL) {
+        /* already on disk */
+        nread = pread(fd, buf, size, off);
+        if (nread < 0) {
+            cdb_seterrno(vio->db, CDB_READERR, __FILE__, __LINE__);
+            return -1;
+        }
+        return nread;
+    }
+
+    /* still buffered: copy what is available from the offset inside the buffer */
+    uint64_t boff = off - hit->off;
+    nread = CDBMIN(size, hit->pos - boff);
+    memcpy(buf, hit->buf + boff, nread);
+    return nread;
+}
+
+
+/*
+ * Write the global meta file (myio->mfd): a FILEMETASIZE header with the buffer
+ * state and file counters, followed by one entry per index file and one per data
+ * file, in list order. The in-memory state is serialized under myio->lock; the
+ * two pwrite calls happen after the lock is released.
+ *
+ * Returns 0 on success, -1 on write error (CDB_WRITEERR is set).
+ */
+static int _vio_apnd2_writemeta(CDBVIO *vio)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    char buf[FILEMETASIZE];
+    char *hbuf;
+    int hbufsize;
+    int pos = 0;
+
+    /* header: padded with 'X', magic first, then eight 32-bit fields */
+    memset(buf, 'X', FILEMETASIZE);
+    memcpy(buf, FILEMAGICHEADER, FILEMAGICLEN);
+    pos += FILEMAGICLEN;
+    cdb_lock_lock(myio->lock);
+    *(uint32_t*)(buf + pos) = myio->ibuf.off;
+    pos += SI4;
+    *(uint32_t*)(buf + pos) = myio->ibuf.limit;
+    pos += SI4;
+    *(uint32_t*)(buf + pos) = myio->dbuf.off;
+    pos += SI4;
+    *(uint32_t*)(buf + pos) = myio->dbuf.limit;
+    pos += SI4;
+    *(uint32_t*)(buf + pos) = myio->ifnum;
+    pos += SI4;
+    *(uint32_t*)(buf + pos) = myio->dfnum;
+    pos += SI4;
+    *(uint32_t*)(buf + pos) = myio->ibuf.fid;
+    pos += SI4;
+    *(uint32_t*)(buf + pos) = myio->dbuf.fid;
+    pos += SI4;
+
+    /* entry area is sized from the file counters; data entries carry an extra 'nexpire' */
+    hbufsize = (SI4 + SI4 + SI4 + SI8 + SI8 + 1 + 1) * myio->ifnum;
+    hbufsize += (SI4 + SI4 + SI4 + SI4 + SI8 + SI8 + 1 + 1) * myio->dfnum;
+    /* NOTE(review): the malloc result is not checked before memset */
+    hbuf = (char*)malloc(hbufsize);
+    memset(hbuf, 'X', hbufsize);
+    pos = 0;
+    /* walk the index file list from head to tail */
+    VIOAPND2FINFO *finfo = myio->idxfhead;
+    while(finfo != NULL) {
+        *(uint32_t*)(hbuf + pos) = finfo->fid;
+        pos += 4;
+        *(uint32_t*)(hbuf + pos) = finfo->fsize;
+        pos += 4;
+        *(uint32_t*)(hbuf + pos) = finfo->rcyled;
+        pos += 4;
+        *(uint64_t*)(hbuf + pos) = finfo->oidf;
+        pos += 8;
+        *(uint64_t*)(hbuf + pos) = finfo->oidl;
+        pos += 8;
+        *(uint8_t*)(hbuf + pos) = finfo->fstatus;
+        pos += 1;
+        *(uint8_t*)(hbuf + pos) = finfo->ftype;
+        pos += 1;
+        finfo = finfo->fnext;
+    }
+
+    /* walk the data file list from head to tail */
+    finfo = myio->datfhead;
+    while(finfo != NULL) {
+        *(uint32_t*)(hbuf + pos) = finfo->fid;
+        pos += 4;
+        *(uint32_t*)(hbuf + pos) = finfo->fsize;
+        pos += 4;
+        *(uint32_t*)(hbuf + pos) = finfo->rcyled;
+        pos += 4;
+        *(uint32_t*)(hbuf + pos) = finfo->nexpire;
+        pos += 4;
+        *(uint64_t*)(hbuf + pos) = finfo->oidf;
+        pos += 8;
+        *(uint64_t*)(hbuf + pos) = finfo->oidl;
+        pos += 8;
+        *(uint8_t*)(hbuf + pos) = finfo->fstatus;
+        pos += 1;
+        *(uint8_t*)(hbuf + pos) = finfo->ftype;
+        pos += 1;
+        finfo = finfo->fnext;
+    }
+    cdb_lock_unlock(myio->lock);
+
+    /* header at offset 0, entries right behind it */
+    if (pwrite(myio->mfd, buf, FILEMETASIZE, 0) != FILEMETASIZE) {
+        cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__);
+        free(hbuf);
+        return -1;
+    }
+
+    if (pwrite(myio->mfd, hbuf, hbufsize, FILEMETASIZE) != hbufsize) {
+        cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__);
+        free(hbuf);
+        return -1;
+    }
+    free(hbuf);
+
+    return 0;
+}
+
+
+/*
+ * Close the database storage.
+ *
+ * Flushes the index and data buffers and rewrites the headers of the two files
+ * being written, closes every cached descriptor and both buffer files, rewrites
+ * the global meta file, removes pid.cdb and dellog.cdb, marks the database as
+ * closed, closes the remaining descriptors and destroys the i/o object.
+ * Results of the individual steps are not checked; the function always returns 0.
+ */
+static int _vio_apnd2_close(CDBVIO *vio)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    CDBHTITEM *item;
+    char filename[MAX_PATH_LEN] = {0};
+    VIOAPND2FINFO *finfo;
+
+    /* flush each buffer, then store the up-to-date header of its current file */
+    _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX);
+    finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->idxmeta, &myio->ibuf.fid, SI4, false);
+    if (finfo)
+        _vio_apnd2_writefmeta(vio, myio->ibuf.fd, finfo);
+    _vio_apnd2_flushbuf(vio, VIOAPND2_DATA);
+    finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &myio->dbuf.fid, SI4, false);
+    if (finfo)
+        _vio_apnd2_writefmeta(vio, myio->dbuf.fd, finfo);
+
+    /* iterate and close the fd cache */
+    item = cdb_ht_iterbegin(myio->fdcache);
+    while(item != NULL) {
+        close(*(int*)cdb_ht_itemval(myio->fdcache, item));
+        item = cdb_ht_iternext(myio->fdcache, item);
+    }
+
+    /* close the files behind the two write buffers */
+    if (myio->dbuf.fd > 0)
+        close(myio->dbuf.fd);
+    if (myio->ibuf.fd > 0)
+        close(myio->ibuf.fd);
+
+    /* rewrite the metafile */
+    _vio_apnd2_writemeta(vio);
+    /* remove the pid file */
+    snprintf(filename, MAX_PATH_LEN, "%s/pid.cdb", myio->filepath);
+    unlink(filename);
+    /* the deletion log is only useful for recovery after an unclean close */
+    snprintf(filename, MAX_PATH_LEN, "%s/dellog.cdb", myio->filepath);
+    unlink(filename);
+    /* record that the database was closed properly */
+    _vio_apnd2_setopensig(vio, VIOAPND2_SIGCLOSED);
+    /* close the remaining descriptors (hfd, mfd, dfd) */
+    if (myio->hfd > 0)
+        close(myio->hfd);
+    if (myio->mfd > 0)
+        close(myio->mfd);
+    if (myio->dfd > 0)
+        close(myio->dfd);
+    _vio_apnd2_destroy(vio);
+    return 0;
+}
+
+
+/*
+ * Open index/data file 'fid' read-only and remember its descriptor in the fd
+ * cache, closing entries popped from the cache tail while the cache holds more
+ * than maxfds entries. The function runs under lock protection.
+ * Returns the descriptor, or -1 on a bad 'dtype' or open failure.
+ */
+static int _vio_apnd2_loadfd(CDBVIO *vio, uint32_t fid, int dtype)
+{
+    VIOAPND2 *io = (VIOAPND2*)vio->iometa;
+    char path[MAX_PATH_LEN];
+    const char *prefix;
+    uint32_t cachekey;
+    int newfd;
+
+    /* pick file name prefix and cache key by file type */
+    if (dtype == VIOAPND2_DATA) {
+        prefix = "dat";
+        cachekey = VFIDDAT(fid);
+    } else if (dtype == VIOAPND2_INDEX) {
+        prefix = "idx";
+        cachekey = VFIDIDX(fid);
+    } else {
+        cdb_seterrno(vio->db, CDB_INTERNALERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    snprintf(path, MAX_PATH_LEN, "%s/%s%08d.cdb", io->filepath, prefix, fid);
+    newfd = open(path, O_RDONLY, 0644);
+    if (newfd < 0) {
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    /* cache the fd, then shrink the cache back to its limit from the tail */
+    cdb_ht_insert2(io->fdcache, &cachekey, SI4, &newfd, sizeof(int));
+    while (io->fdcache->num > io->maxfds) {
+        CDBHTITEM *victim = cdb_ht_poptail(io->fdcache);
+        int *oldfd = (int*)cdb_ht_itemval(io->fdcache, victim);
+        close(*oldfd);
+        free(victim);
+    }
+
+    return newfd;
+}
+
+/*
+ * Read the index page stored at virtual offset 'off'.
+ *
+ * '*page' must point to a caller-provided buffer. The first PAGEAREADSIZE bytes
+ * are read into it; when the page is larger than that, the remainder is read in a
+ * second step. If the page does not fit the caller's buffer (psize > fixbufsize),
+ * a new buffer is malloc'ed and '*page' is redirected to it, so the caller must
+ * compare '*page' with its own buffer to know whether to free it.
+ *
+ * On success the page's osize/ooff/cap fields are set and 0 is returned.
+ * Every failure returns -1; a failed or short read is never reported as success.
+ */
+static int _vio_apnd2_readpage(CDBVIO *vio, CDBPAGE **page, FOFF off)
+{
+    VIOAPND2 *myio = (VIOAPND2*)vio->iometa;
+    int ret, fd;
+    uint32_t psize;
+    uint32_t fid, roff;
+    uint32_t fixbufsize = SBUFSIZE - (sizeof(CDBPAGE) - PAGEHSIZE);
+    uint32_t areadsize = PAGEAREADSIZE; //vio->db->areadsize;
+
+    VOFF2ROFF(off, fid, roff);
+    /* avoid dirty memory */
+    (*page)->magic = 0;
+
+    cdb_lock_lock(myio->lock);
+    if (fid == myio->ibuf.fid)
+        /* read from current writing file? */
+        fd = myio->ibuf.fd;
+    else {
+        /* old index file */
+        int vfid, *fdret;
+        vfid = VFIDIDX(fid);
+        /* in cache? */
+        fdret = cdb_ht_get2(myio->fdcache, &vfid, sizeof(vfid), true);
+        if (fdret == NULL) {
+            fd = _vio_apnd2_loadfd(vio, fid, VIOAPND2_INDEX);
+            if (fd < 0) {
+                cdb_lock_unlock(myio->lock);
+                return -1;
+            }
+        } else
+            fd = *fdret;
+    }
+
+    /* NOTICE: the data on disk actually starts at 'magic' field in structure */
+    ret = _vio_apnd2_read(vio, fd, &(*page)->magic, areadsize, roff);
+    if (ret <= 0) {
+        cdb_lock_unlock(myio->lock);
+        return -1;
+    }
+
+    if ((*page)->magic != PAGEMAGIC) {
+        cdb_lock_unlock(myio->lock);
+        cdb_seterrno(vio->db, CDB_DATAERRIDX, __FILE__, __LINE__);
+        return -1;
+    }
+
+    psize = PAGESIZE(*page);
+    /* ret is positive here, so the unsigned comparison is safe */
+    if ((uint32_t)ret < areadsize && (uint32_t)ret < psize) {
+        /* truncated page: report failure, same as _vio_apnd2_readrec does */
+        cdb_lock_unlock(myio->lock);
+        cdb_seterrno(vio->db, CDB_DATAERRIDX, __FILE__, __LINE__);
+        return -1;
+    } else if (psize > areadsize) {
+        /* need another read operation since the page is larger than the default read size */
+        if (psize > fixbufsize) {
+            /* page is larger than the caller's buffer, move it to the heap */
+            CDBPAGE *npage = (CDBPAGE *)malloc(sizeof(CDBPAGE) + (*page)->num * sizeof(PITEM));
+            if (npage == NULL) {
+                /* out of memory: leave '*page' pointing at the caller's buffer */
+                cdb_lock_unlock(myio->lock);
+                cdb_seterrno(vio->db, CDB_INTERNALERR, __FILE__, __LINE__);
+                return -1;
+            }
+            memcpy(&npage->magic, &(*page)->magic, areadsize);
+            *page = npage;
+        }
+
+        ret = _vio_apnd2_read(vio, fd, (char*)&(*page)->magic + areadsize,
+            psize - areadsize, roff + areadsize);
+        /* test the sign first: comparing -1 against an unsigned value would hide the error */
+        if (ret < 0 || (uint32_t)ret < psize - areadsize) {
+            cdb_lock_unlock(myio->lock);
+            cdb_seterrno(vio->db, CDB_DATAERRIDX, __FILE__, __LINE__);
+            return -1;
+        }
+    }
+
+    cdb_lock_unlock(myio->lock);
+
+    /* remember where i got the page, calculate into junk space if page is discarded */
+    (*page)->osize = OFFALIGNED(psize);
+    (*page)->ooff = off;
+    (*page)->cap = (*page)->num;
+    return 0;
+}
+
+/*
+ * Read the data record stored at virtual offset 'off'.
+ *
+ * '*rec' must point to a caller-provided buffer. The first vio->db->areadsize
+ * bytes are read into it; a larger record needs a second read, and if it does not
+ * fit the caller's buffer (rsize > fixbufsize) a new buffer is malloc'ed and
+ * '*rec' is redirected to it. With readval == false the record's vsize is set
+ * to 0 so only the header and key are fetched.
+ *
+ * On success key/val/osize/ooff are set and 0 is returned; -1 on any failure.
+ */
+static int _vio_apnd2_readrec(CDBVIO *vio, CDBREC** rec, FOFF off, bool readval)
+{
+    VIOAPND2 *myio = (VIOAPND2*)vio->iometa;
+    int ret, fd;
+    uint32_t rsize;
+    uint32_t fid, roff;
+    /* the 'rec' is hoped to fit in the caller's buffer; the usable size is a little */
+    /* smaller because some fields in the CDBREC structure are not on disk */
+    uint32_t fixbufsize = SBUFSIZE - (sizeof(CDBREC) - RECHSIZE);
+    uint32_t areadsize = vio->db->areadsize;
+
+    VOFF2ROFF(off, fid, roff);
+    /* avoid dirty memory */
+    (*rec)->magic = 0;
+
+    cdb_lock_lock(myio->lock);
+    if (fid == myio->dbuf.fid)
+        /* read from current writing file? */
+        fd = myio->dbuf.fd;
+    else {
+        /* read from old data file: look in the fd cache first, open on a miss */
+        int vfid, *fdret;
+        vfid = VFIDDAT(fid);
+        fdret = cdb_ht_get2(myio->fdcache, &vfid, sizeof(vfid), true);
+        if (fdret == NULL) {
+            fd = _vio_apnd2_loadfd(vio, fid, VIOAPND2_DATA);
+            if (fd < 0) {
+                cdb_lock_unlock(myio->lock);
+                return -1;
+            }
+        } else
+            fd = *fdret;
+    }
+
+    /* NOTICE: the data on disk actually starts at 'magic' field in structure */
+    ret = _vio_apnd2_read(vio, fd, &(*rec)->magic, areadsize, roff);
+    if (ret <= 0) {
+        cdb_lock_unlock(myio->lock);
+        return -1;
+    }
+
+    if ((*rec)->magic != RECMAGIC) {
+        cdb_lock_unlock(myio->lock);
+        cdb_seterrno(vio->db, CDB_DATAERRDAT, __FILE__, __LINE__);
+        return -1;
+    }
+
+    /* keep the on-disk value size; it is needed for 'osize' when the value is skipped */
+    uint32_t ovsize = (*rec)->vsize;
+    if (!readval)
+        /* read key only */
+        (*rec)->vsize = 0;
+    rsize = RECSIZE(*rec);
+
+    if (ret < areadsize && ret < rsize) {
+        /* fewer bytes available than the record needs */
+        cdb_lock_unlock(myio->lock);
+        cdb_seterrno(vio->db, CDB_DATAERRDAT, __FILE__, __LINE__);
+        return -1;
+    } else if (rsize > areadsize) {
+        /* need another read */
+        if (rsize > fixbufsize) {
+            /* record is larger than the caller's buffer, move it to the heap */
+            /* NOTE(review): the malloc result is not checked before memcpy */
+            CDBREC *nrec = (CDBREC *)malloc(sizeof(CDBREC)+(*rec)->ksize+(*rec)->vsize);
+            memcpy(&nrec->magic, &(*rec)->magic, areadsize);
+            *rec = nrec;
+        }
+        ret = _vio_apnd2_read(vio, fd, (char*)&(*rec)->magic + areadsize,
+            rsize - areadsize, roff + areadsize);
+        if (ret != rsize - areadsize) {
+            cdb_lock_unlock(myio->lock);
+            cdb_seterrno(vio->db, CDB_DATAERRDAT, __FILE__, __LINE__);
+            return -1;
+        }
+    }
+    cdb_lock_unlock(myio->lock);
+
+    /* fix pointers: key and value sit back to back in 'buf' */
+    (*rec)->key = (*rec)->buf;
+    (*rec)->val = (*rec)->buf + (*rec)->ksize;
+
+    /* even if didn't read the value, still keep the complete (old) size */
+    if (!readval)
+        (*rec)->osize = OFFALIGNED(rsize + ovsize);
+    else
+        (*rec)->osize = OFFALIGNED(rsize);
+
+    (*rec)->ooff = off;
+    return 0;
+}
+
+
+/*
+ * Write an index page and return its virtual offset through '*off'.
+ *
+ * The page gets PAGEMAGIC and a fresh operation id. If it replaces an earlier
+ * copy (page->ooff is set), the old copy's size is added to the 'rcyled' counter
+ * of the file that held it. A page larger than the buffer limit is written
+ * straight to the file between two flushes; otherwise it is appended to the index
+ * buffer, flushing first when it would not fit.
+ *
+ * Returns 0, or -1 when no index file could be opened. Results of the flush and
+ * direct write calls are not checked.
+ */
+static int _vio_apnd2_writepage(CDBVIO *vio, CDBPAGE *page, FOFF *off)
+{
+    VIOAPND2 *myio = (VIOAPND2*)vio->iometa;
+    VIOAPND2FINFO *finfo;
+    uint32_t psize = PAGESIZE(page);
+    uint32_t fid, roff;
+    uint32_t ofid;
+
+    page->magic = PAGEMAGIC;
+    page->oid = cdb_genoid(vio->db);
+
+    cdb_lock_lock(myio->lock);
+    /* buffer ready? */
+    if (myio->ibuf.fd < 0) {
+        if (_vio_apnd2_shiftnew(vio, VIOAPND2_INDEX) < 0) {
+            cdb_lock_unlock(myio->lock);
+            return -1;
+        }
+    }
+
+    /* if it was modified from existing page, remember the wasted space */
+    if (OFFNOTNULL(page->ooff)) {
+        VOFF2ROFF(page->ooff, ofid, roff);
+        finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->idxmeta, &ofid, SI4, false);
+        if (finfo)
+            finfo->rcyled += page->osize;
+    }
+
+    if (psize > myio->ibuf.limit) {
+        /* page too large for the buffer: flush, write directly, flush again */
+        _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX);
+        /* after the flush ibuf.off is the aligned end of file, where the page will land */
+        fid = myio->ibuf.fid;
+        roff = myio->ibuf.off;
+        _vio_apnd2_write(vio, myio->ibuf.fd, &page->magic, psize, true);
+        myio->ibuf.oid = page->oid;
+        _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX);
+        cdb_lock_unlock(myio->lock);
+
+        /* remember last wrote offset */
+        ROFF2VOFF(fid, roff, *off);
+        page->ooff = *off;
+        page->osize = OFFALIGNED(psize);
+        return 0;
+    } else if (psize + myio->ibuf.pos > myio->ibuf.limit)
+        /* buffer is full */
+        _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX);
+
+    /* copy to buffer; the page's file offset is buffer start plus position in buffer */
+    fid = myio->ibuf.fid;
+    roff = myio->ibuf.off + myio->ibuf.pos;
+    memcpy(myio->ibuf.buf + myio->ibuf.pos, &page->magic, psize);
+    myio->ibuf.pos += psize;
+    myio->ibuf.pos = OFFALIGNED(myio->ibuf.pos);
+    myio->ibuf.oid = page->oid;
+    cdb_lock_unlock(myio->lock);
+    ROFF2VOFF(fid, roff, *off);
+
+    /* remember last wrote offset */
+    page->ooff = *off;
+    page->osize = OFFALIGNED(psize);
+    return 0;
+}
+
+
+/*
+ * Log the deletion of the record at virtual offset 'off'.
+ *
+ * The offset is appended to the deletion buffer, which is flushed to the deletion
+ * log once it holds DELBUFMAX entries. If 'rec' was read from disk (rec->ooff is
+ * set), its size is added to the 'rcyled' counter of the data file holding 'off'.
+ *
+ * Returns 0 on success, -1 when the deletion log could not be flushed. The lock is
+ * released on every path.
+ */
+static int _vio_apnd2_deleterec(CDBVIO *vio, CDBREC *rec, FOFF off)
+{
+    VIOAPND2 *myio = (VIOAPND2*)vio->iometa;
+    uint32_t ofid, roff;
+
+    cdb_lock_lock(myio->lock);
+    /* an earlier failed flush leaves the buffer full (delbufpos is not reset on */
+    /* error); retry it here rather than storing past DELBUFMAX entries */
+    /* NOTE(review): presumably delbuf holds exactly DELBUFMAX entries - confirm */
+    if (myio->delbufpos >= DELBUFMAX
+            && _vio_apnd2_flushbuf(vio, VIOAPND2_DELLOG) < 0) {
+        cdb_lock_unlock(myio->lock);
+        return -1;
+    }
+
+    myio->delbuf[myio->delbufpos] = off;
+    if (++myio->delbufpos >= DELBUFMAX) {
+        if (_vio_apnd2_flushbuf(vio, VIOAPND2_DELLOG) < 0) {
+            /* do not leave the lock held on the error path */
+            cdb_lock_unlock(myio->lock);
+            return -1;
+        }
+    }
+
+    /* it is a deleted record, remember the space to be recycled */
+    VOFF2ROFF(off, ofid, roff);
+    if (OFFNOTNULL(rec->ooff)) {
+        VIOAPND2FINFO *finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &ofid, SI4, false);
+        if (finfo) {
+            finfo->rcyled += rec->osize;
+        }
+    }
+    cdb_lock_unlock(myio->lock);
+    return 0;
+}
+
+
+
+/*
+ * Write a data record and return its virtual offset through '*off'.
+ *
+ * ptrtype selects where key and value are taken from:
+ *   VIOAPND2_RECEXTERNAL - from rec->key / rec->val; rec->magic is set here.
+ *   VIOAPND2_RECINTERNAL - from rec->buf (key followed by value).
+ *
+ * The record gets a fresh operation id. If it overwrites an earlier copy
+ * (rec->ooff is set), the old size is added to the 'rcyled' counter of the file
+ * that held it. A record larger than the buffer limit is written straight to the
+ * file between two flushes; otherwise it is appended to the data buffer, flushing
+ * first when it would not fit. For records with an expire time the file's
+ * 'nexpire' is lowered to it when it is earlier (or set when 'nexpire' is 0).
+ *
+ * Returns 0, or -1 when no data file could be opened. Results of the flush and
+ * direct write calls are not checked.
+ * NOTE(review): rec->osize / rec->ooff are updated only on the buffered path, not
+ * on the large-record path - confirm this is intended.
+ */
+static int _vio_apnd2_writerec(CDBVIO *vio, CDBREC *rec, FOFF *off, int ptrtype) {
+    VIOAPND2 *myio = (VIOAPND2*)vio->iometa;
+    uint32_t rsize = RECSIZE(rec);
+    uint32_t fid, roff, ofid;
+    if (ptrtype == VIOAPND2_RECEXTERNAL)
+        rec->magic = RECMAGIC;
+
+    /* oid always are increment, even if it is a record moved from an old data file */
+    rec->oid = cdb_genoid(vio->db);
+    cdb_lock_lock(myio->lock);
+    /* buffer ready? */
+    if (myio->dbuf.fd < 0) {
+        if (_vio_apnd2_shiftnew(vio, VIOAPND2_DATA) < 0) {
+            cdb_lock_unlock(myio->lock);
+            return -1;
+        }
+    }
+    /* it is an overwritten record, remember the space to be recycled */
+    if (OFFNOTNULL(rec->ooff)) {
+        VOFF2ROFF(rec->ooff, ofid, roff);
+        VIOAPND2FINFO *finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &ofid, SI4, false);
+        if (finfo)
+            finfo->rcyled += rec->osize;
+    }
+    if (rsize > myio->dbuf.limit) {
+        /* record too large for the buffer: flush, write directly, flush again */
+        _vio_apnd2_flushbuf(vio, VIOAPND2_DATA);
+        /* after the flush dbuf.off is the aligned end of file, where the record will land */
+        fid = myio->dbuf.fid;
+        roff = myio->dbuf.off;
+        /* header is written aligned, key and value follow it unaligned */
+        _vio_apnd2_write(vio, myio->dbuf.fd, &rec->magic, RECHSIZE, true);
+        if (ptrtype == VIOAPND2_RECINTERNAL)
+            _vio_apnd2_write(vio, myio->dbuf.fd, rec->buf, rec->ksize + rec->vsize, false);
+        else {
+            _vio_apnd2_write(vio, myio->dbuf.fd, rec->key, rec->ksize, false);
+            _vio_apnd2_write(vio, myio->dbuf.fd, rec->val, rec->vsize, false);
+        }
+        /* reset the buffer */
+        myio->dbuf.oid = rec->oid;
+        _vio_apnd2_flushbuf(vio, VIOAPND2_DATA);
+        /* track the earliest expire time of the file the record went into */
+        if (rec->expire) {
+            VIOAPND2FINFO *finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &fid, SI4, false);
+            if (finfo) {
+                if (finfo->nexpire == 0) {
+                    finfo->lcktime = time(NULL);
+                    finfo->nexpire = rec->expire;
+                } else if (finfo->nexpire > rec->expire) {
+                    finfo->nexpire = rec->expire;
+                }
+            }
+        }
+        cdb_lock_unlock(myio->lock);
+        ROFF2VOFF(fid, roff, *off);
+        return 0;
+    } else if (rsize + myio->dbuf.pos > myio->dbuf.limit)
+        /* buffer is full */
+        _vio_apnd2_flushbuf(vio, VIOAPND2_DATA);
+    /* copy to buffer; the record's file offset is buffer start plus position in buffer */
+    fid = myio->dbuf.fid;
+    roff = myio->dbuf.off + myio->dbuf.pos;
+    memcpy(myio->dbuf.buf + myio->dbuf.pos, &rec->magic, RECHSIZE);
+    myio->dbuf.pos += RECHSIZE;
+    if (ptrtype == VIOAPND2_RECINTERNAL) {
+        memcpy(myio->dbuf.buf + myio->dbuf.pos, rec->buf, rec->ksize + rec->vsize);
+        myio->dbuf.pos += rec->ksize + rec->vsize;
+    } else {
+        memcpy(myio->dbuf.buf + myio->dbuf.pos, rec->key, rec->ksize);
+        myio->dbuf.pos += rec->ksize;
+        memcpy(myio->dbuf.buf + myio->dbuf.pos, rec->val, rec->vsize);
+        myio->dbuf.pos += rec->vsize;
+    }
+    myio->dbuf.pos = OFFALIGNED(myio->dbuf.pos);
+    myio->dbuf.oid = rec->oid;
+    /* track the earliest expire time of the file the record went into */
+    if (rec->expire) {
+        VIOAPND2FINFO *finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &fid, SI4, false);
+        if (finfo) {
+            if (finfo->nexpire == 0) {
+                finfo->lcktime = time(NULL);
+                finfo->nexpire = rec->expire;
+            } else if (finfo->nexpire > rec->expire) {
+                finfo->nexpire = rec->expire;
+            }
+        }
+    }
+    ROFF2VOFF(fid, roff, *off);
+    cdb_lock_unlock(myio->lock);
+    /* remember where and how large the record was written */
+    rec->osize = rsize;
+    rec->ooff = *off;
+    return 0;
+}
+
+/* Write a record whose key and value are reached through rec->key / rec->val;
+ * forwards to _vio_apnd2_writerec with VIOAPND2_RECEXTERNAL and returns its result. */
+static int _vio_apnd2_writerecexternal(CDBVIO *vio, CDBREC *rec, FOFF *off)
+{
+    const int status = _vio_apnd2_writerec(vio, rec, off, VIOAPND2_RECEXTERNAL);
+
+    return status;
+}
+
+/* Write a record whose key and value sit back to back in rec->buf;
+ * forwards to _vio_apnd2_writerec with VIOAPND2_RECINTERNAL and returns its result. */
+static int _vio_apnd2_writerecinternal(CDBVIO *vio, CDBREC *rec, FOFF *off)
+{
+    const int status = _vio_apnd2_writerec(vio, rec, off, VIOAPND2_RECINTERNAL);
+
+    return status;
+}
+
+
+/* Drain both write buffers, ask the OS to persist the current data and index
+ * files, then rewrite the database header (without the main index table).
+ * Everything runs under myio->lock. Always returns 0. */
+static int _vio_apnd2_sync(CDBVIO *vio)
+{
+    VIOAPND2 *io = (VIOAPND2 *)vio->iometa;
+
+    cdb_lock_lock(io->lock);
+
+    /* buffered bytes must reach the files before fdatasync can cover them */
+    _vio_apnd2_flushbuf(vio, VIOAPND2_DATA);
+    _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX);
+
+    if (io->dbuf.fd > 0) {
+        fdatasync(io->dbuf.fd);
+    }
+    if (io->ibuf.fd > 0) {
+        fdatasync(io->ibuf.fd);
+    }
+
+    _vio_apnd2_writehead(vio, false);
+
+    cdb_lock_unlock(io->lock);
+    return 0;
+}
+
+
+/* write db information and main index table into a single file
+ *
+ * The header occupies the first FILEMETASIZE bytes of myio->hfd and holds, in
+ * host byte order: the magic string, db->hsize (4 bytes), db->oid, db->roid,
+ * db->rnum (8 bytes each) and the VIOAPND2_SIGOPEN marker (4 bytes); the rest
+ * is padded with 'X'.
+ *
+ * wtable: when true, db->mtable (db->hsize FOFF entries) is written right
+ *         after the header as well.
+ *
+ * Returns 0 on success; on a short or failed write sets CDB_WRITEERR and
+ * returns -1. */
+static int _vio_apnd2_writehead(CDBVIO *vio, bool wtable)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    CDB *db = vio->db;
+    char buf[FILEMETASIZE];
+    int pos = 0;
+    uint32_t v32;
+    uint64_t v64;
+
+    memset(buf, 'X', FILEMETASIZE);
+    memcpy(buf, FILEMAGICHEADER, FILEMAGICLEN);
+    pos += FILEMAGICLEN;
+
+    /* fields are copied with memcpy: buf + pos is a char buffer at an
+     * arbitrary offset, so storing through a casted integer pointer would be
+     * an unaligned, aliasing-violating access */
+    v32 = db->hsize;
+    memcpy(buf + pos, &v32, sizeof(v32));
+    pos += SI4;
+    v64 = db->oid;
+    memcpy(buf + pos, &v64, sizeof(v64));
+    pos += SI8;
+    v64 = db->roid;
+    memcpy(buf + pos, &v64, sizeof(v64));
+    pos += SI8;
+    v64 = db->rnum;
+    memcpy(buf + pos, &v64, sizeof(v64));
+    pos += SI8;
+    v32 = VIOAPND2_SIGOPEN;
+    memcpy(buf + pos, &v32, sizeof(v32));
+    pos += SI4;
+
+    if (pwrite(myio->hfd, buf, FILEMETASIZE, 0) != FILEMETASIZE) {
+        cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    if (wtable && pwrite(myio->hfd, db->mtable, sizeof(FOFF) * db->hsize, FILEMETASIZE)
+        != sizeof(FOFF) * db->hsize) {
+            cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__);
+            return -1;
+    }
+    return 0;
+}
+
+
+/* Upper-layer entry point: write the header together with the main index
+ * table. Returns whatever _vio_apnd2_writehead returns. */
+static int _vio_apnd2_writehead2(CDBVIO *vio)
+{
+    const bool withtable = true;
+
+    return _vio_apnd2_writehead(vio, withtable);
+}
+
+
+/* read db information and main index table from a single file
+ *
+ * For a freshly created db (myio->create) nothing is read: an all-zero main
+ * index table of db->hsize entries is allocated and the header is written out.
+ * Otherwise the FILEMETASIZE-byte header is read from myio->hfd, its magic is
+ * verified and db->hsize / oid / roid / rnum are loaded from it.
+ *
+ * rtable: when true, the main index table stored after the header replaces
+ *         db->mtable.
+ *
+ * Returns 0 on success, -1 on failure. Read failures set CDB_READERR, a bad
+ * magic sets CDB_DATAERRMETA; an allocation failure returns -1 without setting
+ * an error code. db->mtable is never left pointing at freed memory. */
+static int _vio_apnd2_readhead(CDBVIO *vio, bool rtable)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    CDB *db = vio->db;
+    char buf[FILEMETASIZE];
+    int pos = 0;
+    uint32_t v32;
+    uint64_t v64;
+    size_t tsize;
+
+    if (myio->create) {
+        /* the db is just created, allocate a empty main index table for db */
+        tsize = sizeof(FOFF) * db->hsize;
+        db->mtable = (FOFF *)malloc(tsize);
+        if (db->mtable == NULL) {
+            /* malloc(0) may legitimately return NULL */
+            if (tsize != 0)
+                return -1;
+        } else {
+            memset(db->mtable, 0, tsize);
+        }
+        _vio_apnd2_writehead(vio, false);
+        return 0;
+    }
+
+    if (pread(myio->hfd, buf, FILEMETASIZE, 0) != FILEMETASIZE) {
+        cdb_seterrno(db, CDB_READERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    if (memcmp(buf, FILEMAGICHEADER, FILEMAGICLEN)) {
+        cdb_seterrno(db, CDB_DATAERRMETA, __FILE__, __LINE__);
+        return -1;
+    }
+
+    /* memcpy instead of casted loads: buf + pos has no alignment guarantee */
+    pos += FILEMAGICLEN;
+    memcpy(&v32, buf + pos, sizeof(v32));
+    db->hsize = v32;
+    pos += SI4;
+    memcpy(&v64, buf + pos, sizeof(v64));
+    db->oid = v64;
+    pos += SI8;
+    memcpy(&v64, buf + pos, sizeof(v64));
+    db->roid = v64;
+    pos += SI8;
+    memcpy(&v64, buf + pos, sizeof(v64));
+    db->rnum = v64;
+    pos += SI8;
+    /* 4 bytes reserved for open status */
+    pos += SI4;
+
+    if (!rtable)
+        return 0;
+
+    /* drop any previous table; clear the pointer so a failure below cannot
+     * leave it dangling for the next call to free again */
+    free(db->mtable);
+    db->mtable = NULL;
+
+    tsize = sizeof(FOFF) * db->hsize;
+    db->mtable = (FOFF *)malloc(tsize);
+    if (db->mtable == NULL && tsize != 0)
+        return -1;
+    if (pread(myio->hfd, db->mtable, tsize, FILEMETASIZE) != tsize) {
+            free(db->mtable);
+            db->mtable = NULL;
+            cdb_seterrno(db, CDB_READERR, __FILE__, __LINE__);
+            return -1;
+    }
+    return 0;
+}
+
+
+/* Upper-layer entry point: read the header together with the main index
+ * table. Returns whatever _vio_apnd2_readhead returns. */
+static int _vio_apnd2_readhead2(CDBVIO *vio)
+{
+    const bool withtable = true;
+
+    return _vio_apnd2_readhead(vio, withtable);
+}
+
+
+/* check if some dat file has too large junk space
+ *
+ * arg is the CDBVIO handle passed as void*.
+ * Pass 1 counts data files whose nexpire is non-zero and not later than now.
+ * Pass 2 walks the data files again; a file that survives both filters is
+ * pinned (ref++), processed by _vio_apnd2_rcyledatafile with myio->lock
+ * released, then unpinned, and the walk restarts from the first item. */
+static void _vio_apnd2_rcyledataspacetask(void *arg)
+{
+    CDBVIO *vio = (CDBVIO *)arg;
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    CDBHTITEM *item;
+    uint32_t now = time(NULL);
+    /* number of data files that may already hold expired records */
+    uint32_t posblexpnum = 0;
+    cdb_lock_lock(myio->lock);
+    item = cdb_ht_iterbegin(myio->datmeta);
+    while(item != NULL) {
+        VIOAPND2FINFO *finfo = (VIOAPND2FINFO*)cdb_ht_itemval(myio->datmeta, item);
+        if (finfo->nexpire && finfo->nexpire <= now)
+            posblexpnum++;
+        item = cdb_ht_iternext(myio->datmeta, item);
+    }
+
+    item = cdb_ht_iterbegin(myio->datmeta);
+    while(item != NULL) {
+        VIOAPND2FINFO *finfo = (VIOAPND2FINFO*)cdb_ht_itemval(myio->datmeta, item);
+        /* copy the id now: it is needed as hash key after finfo may be gone */
+        uint32_t fid = finfo->fid;
+        /* skip the file when less than half of it is known junk AND at least
+         * one of the four conditions below holds */
+        /* NOTE(review): some inline comments below read like the opposite of
+         * the condition they precede (e.g. "long enough time passed" sits on a
+         * test that is true when the next check time is still in the future)
+         * - confirm which one is intended */
+        /* rcyled space size is inaccurate */
+        if (finfo->rcyled * 2 < finfo->fsize
+            /* no data file possibly has expire record */
+            && (posblexpnum == 0
+            /* long enough time passed since last check on this file */
+            || finfo->lcktime + posblexpnum * DATARCYLECHECKFACTOR > now
+            /* check the data file most recent expire record */
+            || finfo->nexpire > now
+            /* no expire record */
+            || finfo->nexpire == 0)) {
+            item = cdb_ht_iternext(myio->datmeta, item);
+            continue;
+        }
+
+        /* do not work on the writing file or file to be deleted */
+        if (finfo->fstatus != VIOAPND2_FULL || finfo->unlink) {
+            item = cdb_ht_iternext(myio->datmeta, item);
+            continue;
+        }
+
+        /* have to iterate and calculate recycle space */
+        finfo->ref++;
+        /* operation on this file should not in lock protection */
+        cdb_lock_unlock(myio->lock);
+
+        /* first call passes false, and lcktime records that a check happened */
+        if (finfo->rcyled * 2 < finfo->fsize) {
+            _vio_apnd2_rcyledatafile(vio, finfo, false);
+            finfo->lcktime = now;
+        }
+
+        /* re-tested on purpose: the call above may have changed rcyled;
+         * this call passes true */
+        if (finfo->rcyled * 2 >= finfo->fsize) {
+            _vio_apnd2_rcyledatafile(vio, finfo, true);
+        }
+
+        cdb_lock_lock(myio->lock);
+        finfo->ref--;
+        if (finfo->ref == 0 && finfo->unlink) {
+            /* unlink the file */
+            _vio_apnd2_unlink(vio, finfo, VIOAPND2_DATA);
+            cdb_ht_del2(myio->datmeta, &fid, SI4);
+        }
+        /* the table may have changed while unlocked, restart the walk */
+        item = cdb_ht_iterbegin(myio->datmeta);
+    }
+    cdb_lock_unlock(myio->lock);
+}
+
+/* Called only from _vio_apnd2_rcylepagespacetask: once a page has been moved
+ * into a new index file, a cached copy of it must point at the new offset too.
+ * Looks for bucket `bid` in the page cache first, then in the dirty page
+ * cache, and stores `off` into the ooff of the first copy found. */
+static void _vio_apnd2_fixcachepageooff(CDB *db, uint32_t bid, FOFF off)
+{
+    CDBPAGE *cached = NULL;
+
+    if (db->pcache != NULL) {
+        cdb_lock_lock(db->pclock);
+        cached = cdb_ht_get2(db->pcache, &bid, SI4, true);
+        cdb_lock_unlock(db->pclock);
+    }
+
+    /* fall back to the dirty page cache only when the clean one missed */
+    if (cached == NULL && db->dpcache != NULL) {
+        cdb_lock_lock(db->dpclock);
+        cached = cdb_ht_get2(db->dpcache, &bid, SI4, true);
+        cdb_lock_unlock(db->dpclock);
+    }
+
+    if (cached == NULL)
+        return;
+    cached->ooff = off;
+}
+
+/* check if some index file has too large junk space
+ *
+ * arg is the CDBVIO handle passed as void*.
+ * Every index file that is VIOAPND2_FULL, not already marked for unlink and
+ * has more than half of its size accounted as recycled is mapped and scanned;
+ * each page still referenced by the main index table is rewritten through
+ * _vio_apnd2_writepage and the table entry is switched to the new offset.
+ * The file is then marked for unlink and removed once no one references it.
+ *
+ * The scan itself runs with myio->lock released; the file is pinned with
+ * finfo->ref meanwhile and the table walk restarts afterwards. */
+static void _vio_apnd2_rcylepagespacetask(void *arg)
+{
+    CDBVIO *vio = (CDBVIO *)arg;
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    CDBHTITEM *item;
+
+    cdb_lock_lock(myio->lock);
+    item = cdb_ht_iterbegin(myio->idxmeta);
+    while(item != NULL) {
+        VIOAPND2FINFO *finfo = (VIOAPND2FINFO*)cdb_ht_itemval(myio->idxmeta, item);
+        uint32_t fid = finfo->fid;
+
+        /* do not work on the writing file or file to be deleted */
+        if (finfo->fstatus != VIOAPND2_FULL || finfo->unlink) {
+            item = cdb_ht_iternext(myio->idxmeta, item);
+            continue;
+        }
+
+        /* junk space too large? */
+        if (finfo->rcyled * 2 > finfo->fsize) {
+            int fd;
+            char filename[MAX_PATH_LEN];
+            snprintf(filename, MAX_PATH_LEN, "%s/idx%08d.cdb", myio->filepath, fid);
+            fd = open(filename, O_RDONLY, 0644);
+            if (fd < 0) {
+                cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+                item = cdb_ht_iternext(myio->idxmeta, item);
+                continue;
+            }
+
+            /* Size and map the file while the lock is still held, so a failure
+             * can simply move on to the next item with a valid iterator.
+             * The mapping is private but writable: the loop below stores
+             * ooff/osize through a CDBPAGE pointer into the mapping, which
+             * would fault on a read-only map. The copy-on-write pages never
+             * reach the file. */
+            off_t end = lseek(fd, 0, SEEK_END);
+            char *map = MAP_FAILED;
+            uint32_t fsize = 0;
+            if (end > 0) {
+                fsize = end;
+                map = mmap(NULL, fsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+            }
+            if (map == MAP_FAILED) {
+                cdb_seterrno(vio->db, CDB_READERR, __FILE__, __LINE__);
+                close(fd);
+                item = cdb_ht_iternext(myio->idxmeta, item);
+                continue;
+            }
+
+            finfo->ref++;
+            /* I/O should not block the lock */
+            cdb_lock_unlock(myio->lock);
+
+            uint32_t pos = FILEMETASIZE;
+            while(pos < fsize) {
+                /* place the struct so that its on-disk part starts at pos */
+                CDBPAGE *page = (CDBPAGE *)&map[pos-(sizeof(CDBPAGE) - PAGEHSIZE)];
+                FOFF off;
+
+                if (page->magic != PAGEMAGIC) {
+                    pos += ALIGNBYTES;
+                    continue;
+                }
+
+                ROFF2VOFF(fid, pos, off);
+                page->ooff = off;
+                page->osize = OFFALIGNED(PAGESIZE(page));
+                if (OFFEQ(vio->db->mtable[page->bid], off)) {
+                    FOFF noff;
+                    _vio_apnd2_writepage(vio, page, &noff);
+                    /* lock and double check */
+                    cdb_lock_lock(vio->db->mlock[page->bid % MLOCKNUM]);
+                    if (OFFEQ(vio->db->mtable[page->bid], off)) {
+                        vio->db->mtable[page->bid] = noff;
+                        _vio_apnd2_fixcachepageooff(vio->db, page->bid, noff);
+                    }
+                    cdb_lock_unlock(vio->db->mlock[page->bid % MLOCKNUM]);
+                }
+                pos += OFFALIGNED(PAGESIZE(page));
+            }
+            munmap(map, fsize);
+            close(fd);
+
+            cdb_lock_lock(myio->lock);
+            /* drop information for the file */
+            finfo->ref--;
+            finfo->unlink = true;
+            if (finfo->ref == 0) {
+                /* unlink the file */
+                _vio_apnd2_unlink(vio, finfo, VIOAPND2_INDEX);
+                cdb_ht_del2(myio->idxmeta, &fid, SI4);
+            }
+            /* reset the iterator */
+            item = cdb_ht_iterbegin(myio->idxmeta);
+            continue;
+        }
+        item = cdb_ht_iternext(myio->idxmeta, item);
+    }
+    cdb_lock_unlock(myio->lock);
+}
+
+
+/* Remove one data or index file from disk and from the bookkeeping around it.
+ * The caller is expected to hold the lock.
+ *
+ * finfo: descriptor of the file to remove.
+ * dtype: VIOAPND2_INDEX or VIOAPND2_DATA; any other value is a no-op.
+ *
+ * Steps: evict and close a cached descriptor for the file, decrement the
+ * matching file counter, unlink the file, and detach finfo from the doubly
+ * linked file list (fixing head/tail when needed). */
+static void _vio_apnd2_unlink(CDBVIO *vio, VIOAPND2FINFO *finfo, int dtype)
+{
+    VIOAPND2 *io = (VIOAPND2 *)vio->iometa;
+    const bool isindex = (dtype == VIOAPND2_INDEX);
+    char path[MAX_PATH_LEN];
+    uint32_t fid = finfo->fid;
+
+    if (!isindex && dtype != VIOAPND2_DATA)
+        return;
+
+    /* pick the index-side or data-side bookkeeping */
+    const char *prefix = isindex ? "idx" : "dat";
+    uint32_t cachekey = isindex ? VFIDIDX(fid) : VFIDDAT(fid);
+    uint32_t *counter = isindex ? &io->ifnum : &io->dfnum;
+    VIOAPND2FINFO **listhead = isindex ? &io->idxfhead : &io->datfhead;
+    VIOAPND2FINFO **listtail = isindex ? &io->idxftail : &io->datftail;
+
+    snprintf(path, MAX_PATH_LEN, "%s/%s%08d.cdb", io->filepath, prefix, fid);
+
+    /* a descriptor may still be cached for this file; close it first */
+    CDBHTITEM *cachedfd = cdb_ht_del(io->fdcache, &cachekey, SI4);
+    if (cachedfd != NULL) {
+        close(*(int*)cdb_ht_itemval(io->fdcache, cachedfd));
+        free(cachedfd);
+    }
+    (*counter)--;
+    unlink(path);
+
+    /* fix linked list of data/index files after remove a finfo from meta table */
+    VIOAPND2FINFO *before = finfo->fprev;
+    VIOAPND2FINFO *after = finfo->fnext;
+    if (before != NULL)
+        before->fnext = after;
+    if (after != NULL)
+        after->fprev = before;
+    if (*listhead == finfo)
+        *listhead = after;
+    if (*listtail == finfo)
+        *listtail = before;
+}
+
+
+/* only be used for sorting files at recovery */
+typedef struct {
+    uint32_t fid;
+    uint64_t oidf;
+} VIOAPND2SREORDER;
+
+
+/* qsort comparator ordering VIOAPND2SREORDER entries by ascending oidf.
+ * Returns a negative, zero or positive value. The two 64-bit ids are compared
+ * explicitly: returning their difference as an int would truncate it and
+ * report a wrong sign (or a false "equal") for ids that are far apart. */
+static int _vio_apnd2_cmpfuncsreorder(const void *p1, const void *p2)
+{
+    const VIOAPND2SREORDER *s1 = (const VIOAPND2SREORDER *)p1;
+    const VIOAPND2SREORDER *s2 = (const VIOAPND2SREORDER *)p2;
+
+    return (s1->oidf > s2->oidf) - (s1->oidf < s2->oidf);
+}
+
+
+/* Recover the database if it was not closed properly, or rebuild everything
+ * from roid = 0 when `force` is set. The procedure runs with no lock
+ * protection.
+ *
+ * Phases:
+ *   1. scan myio->filepath for *.cdb files, registering every data/index file
+ *      in datmeta/idxmeta (index files are deleted instead when forced);
+ *   2. read file meta and the db header, open mainmeta.cdb if it was missing;
+ *   3. order the files by their first oid and link them into the file lists;
+ *   4. make sure a writing index file and a writing data file exist;
+ *   5. rebuild the main index table from the surviving index pages;
+ *   6. replay data records starting at db->roid, then the deletion log;
+ *   7. flush dirty pages, write meta and header, and release what was opened.
+ *
+ * Returns 0 on success and -1 on failure. On failure hfd, mfd and dfd are
+ * closed when open. */
+static int _vio_apnd2_recovery(CDBVIO *vio, bool force)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    CDB *db = vio->db;
+    char filename[MAX_PATH_LEN];
+    struct dirent *filelist;
+    VIOAPND2SREORDER *idxorders;
+    int idxpos, idxlimit;
+    VIOAPND2SREORDER *datorders;
+    int datpos, datlimit;
+    uint32_t imaxfid = 0, dmaxfid = 0;
+    bool gotmindex = false;
+    DIR *dir;
+
+
+    idxpos = datpos = 0;
+    idxlimit = datlimit = 256;
+    idxorders = (VIOAPND2SREORDER *)malloc(idxlimit * sizeof(VIOAPND2SREORDER));
+    datorders = (VIOAPND2SREORDER *)malloc(datlimit * sizeof(VIOAPND2SREORDER));
+    dir = opendir(myio->filepath);
+    myio->dfnum = myio->ifnum = 0;
+    myio->datfhead = myio->datftail = myio->idxfhead = myio->idxftail = NULL;
+    /* special value to mark if found current writing file */
+    myio->ibuf.fid = myio->dbuf.fid = -1;
+
+    /* nothing below can work without the two arrays and the directory */
+    if (idxorders == NULL || datorders == NULL)
+        goto ERRRET;
+    if (dir == NULL) {
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    for (filelist = readdir(dir); filelist; filelist = readdir(dir)) {
+        // Check file name/type
+        const char *cstr = filelist->d_name;
+        size_t namelen = strlen(cstr);
+        /* names shorter than the suffix ("." and ".." among them) cannot
+         * match and must not be indexed before their first byte */
+        if (namelen < 4 || strncmp(cstr + namelen - 4, ".cdb", 4) != 0)
+            /* not a cuttdb file*/
+            continue;
+        if (strcmp(cstr, "dellog.cdb") == 0) {
+            snprintf(filename, MAX_PATH_LEN, "%s/%s", myio->filepath, cstr);
+            myio->dfd = open(filename, O_RDONLY, 0644);
+        } else if (strcmp(cstr, "mainindex.cdb") == 0) {
+            /* only note its presence; the header is read after the scan */
+            gotmindex = true;
+        } else if (strcmp(cstr, "mainmeta.cdb") == 0) {
+            snprintf(filename, MAX_PATH_LEN, "%s/%s", myio->filepath, cstr);
+            myio->mfd = open(filename, O_RDWR, 0644);
+            if (myio->mfd < 0) {
+                cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+                continue;
+            }
+        } else if (namelen == 15
+            && (strncmp(cstr, "dat", 3) == 0 || strncmp(cstr, "idx", 3) == 0)) {
+            VIOAPND2FINFO finfo;
+            uint64_t fsize = 0;
+
+            snprintf(filename, MAX_PATH_LEN, "%s/%s", myio->filepath, cstr);
+            int fd = open(filename, O_RDWR, 0644);
+            if (fd < 0) {
+                cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+                continue;
+            }
+            if (_vio_apnd2_readfmeta(vio, fd, &finfo) < 0) {
+                close(fd);
+                continue;
+            }
+            fsize = lseek(fd, 0, SEEK_END);
+            finfo.rcyled = 0;
+            finfo.ref = 0;
+            finfo.unlink = false;
+            finfo.fprev = finfo.fnext = NULL;
+            if (finfo.ftype == VIOAPND2_INDEX) {
+                if (force) {
+                    /* delete all index file and rebuild them if force to recovery */
+                    close(fd);
+                    unlink(filename);
+                } else {
+                    cdb_ht_insert2(myio->idxmeta, &finfo.fid, SI4, &finfo, sizeof(VIOAPND2FINFO));
+                    idxorders[idxpos].fid = finfo.fid;
+                    idxorders[idxpos].oidf = finfo.oidf;
+                    if (++idxpos == idxlimit) {
+                        /* array is full, double it */
+                        VIOAPND2SREORDER *tmp = (VIOAPND2SREORDER *)realloc(idxorders,
+                            idxlimit * 2 * sizeof(VIOAPND2SREORDER));
+                        if (tmp == NULL) {
+                            close(fd);
+                            goto ERRRET;
+                        }
+                        idxlimit *= 2;
+                        idxorders = tmp;
+                    }
+                    if(finfo.fstatus == VIOAPND2_WRITING) {
+                        /* keep the descriptor: this is the file being appended to */
+                        myio->ibuf.fid = finfo.fid;
+                        myio->ibuf.off = OFFALIGNED(fsize);
+                        myio->ibuf.pos = 0;
+                        myio->ibuf.fd = fd;
+                    } else
+                        close(fd);
+                    if (finfo.fid > imaxfid)
+                        imaxfid = finfo.fid;
+                    myio->ifnum++;
+                }
+            } else if (finfo.ftype == VIOAPND2_DATA) {
+                /* no information about nearest expire record time, make a fake one(non zero) */
+                finfo.nexpire = finfo.lcktime = time(NULL);
+                cdb_ht_insert2(myio->datmeta, &finfo.fid, SI4, &finfo, sizeof(VIOAPND2FINFO));
+                datorders[datpos].fid = finfo.fid;
+                datorders[datpos].oidf = finfo.oidf;
+                if (++datpos == datlimit) {
+                    /* array is full, double it */
+                    VIOAPND2SREORDER *tmp = (VIOAPND2SREORDER *)realloc(datorders,
+                        datlimit * 2 * sizeof(VIOAPND2SREORDER));
+                    if (tmp == NULL) {
+                        close(fd);
+                        goto ERRRET;
+                    }
+                    datlimit *= 2;
+                    datorders = tmp;
+                }
+                if (finfo.fstatus == VIOAPND2_WRITING) {
+                    /* keep the descriptor: this is the file being appended to */
+                    myio->dbuf.fid = finfo.fid;
+                    myio->dbuf.off = OFFALIGNED(fsize);
+                    myio->dbuf.pos = 0;
+                    myio->dbuf.fd = fd;
+                } else
+                    close(fd);
+                if (finfo.fid > dmaxfid)
+                    dmaxfid = finfo.fid;
+                myio->dfnum++;
+            } else
+                close(fd);
+        } /* end of else */
+    } /* end of for */
+
+
+    /* fix recycled size */
+    _vio_apnd2_readmeta(vio, true);
+    closedir(dir);
+    /* mark as closed so the error path does not close it a second time */
+    dir = NULL;
+
+    if (!gotmindex) {
+        /* recovery failed */
+        /* return */
+        goto ERRRET;
+    } else {
+        if (_vio_apnd2_readhead(vio, false) < 0)
+            goto ERRRET;
+    }
+
+    if (myio->mfd < 0) {
+        snprintf(filename, MAX_PATH_LEN, "%s/mainmeta.cdb", myio->filepath);
+        myio->mfd = open(filename, O_RDWR | O_CREAT, 0644);
+        if (myio->mfd < 0) {
+            cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+            goto ERRRET;
+        }
+    }
+
+    /* index files completely broken, replay all records to build the index */
+    if (myio->ifnum == 0 || force)
+        db->roid = 0;
+    /* re-count records num */
+    db->rnum = 0;
+
+    /* fix index/data file meta relation */
+    qsort(datorders, datpos, sizeof(VIOAPND2SREORDER), _vio_apnd2_cmpfuncsreorder);
+    qsort(idxorders, idxpos, sizeof(VIOAPND2SREORDER), _vio_apnd2_cmpfuncsreorder);
+
+    /* chain the data files in sorted order */
+    VIOAPND2FINFO *lfinfo = NULL;
+    for(int i = 0; i < datpos; i++) {
+        VIOAPND2FINFO *cfinfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &datorders[i].fid, SI4, false);
+        if (cfinfo == NULL)
+            continue;
+        if (lfinfo)
+            lfinfo->fnext = cfinfo;
+        else {
+            myio->datfhead = cfinfo;
+        }
+        cfinfo->fprev = lfinfo;
+        lfinfo = cfinfo;
+    }
+    myio->datftail = lfinfo;
+    if (lfinfo)
+        lfinfo->fnext = NULL;
+    /* same for the index files */
+    lfinfo = NULL;
+    for(int i = 0; i < idxpos; i++) {
+        VIOAPND2FINFO *cfinfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->idxmeta, &idxorders[i].fid, SI4, false);
+        if (cfinfo == NULL)
+            continue;
+        if (lfinfo)
+            lfinfo->fnext = cfinfo;
+        else {
+            myio->idxfhead = cfinfo;
+        }
+        cfinfo->fprev = lfinfo;
+        lfinfo = cfinfo;
+    }
+    myio->idxftail = lfinfo;
+    if (lfinfo)
+        lfinfo->fnext = NULL;
+    lfinfo = NULL;
+
+    /* no file in writing state was found: start a new one */
+    if (myio->ibuf.fid == -1) {
+        myio->ibuf.fid = 0;
+        _vio_apnd2_shiftnew(vio, VIOAPND2_INDEX);
+    }
+    if (myio->dbuf.fid == -1) {
+        myio->dbuf.fid = 0;
+        _vio_apnd2_shiftnew(vio, VIOAPND2_DATA);
+    }
+
+    /* fix offsets in main index table */
+    db->mtable = (FOFF *)malloc(db->hsize * sizeof(FOFF));
+    if (db->mtable == NULL)
+        goto ERRRET;
+    memset(db->mtable, 0, db->hsize * sizeof(FOFF));
+    void *it = _vio_apnd2_pageiterfirst(vio, 0);
+    if (it) {
+        char sbuf[SBUFSIZE];
+        CDBPAGE *page = (CDBPAGE *)sbuf;
+        /* need not use iterator since don't care about contents in page */
+        /* I'm just lazy, cpu time is cheap */
+        while(_vio_apnd2_pageiternext(vio, &page, it) == 0) {
+            if (OFFNOTNULL(db->mtable[page->bid])) {
+                /* recalculate the space to be recycled */
+                uint32_t ofid, roff;
+                char osbuf[SBUFSIZE];
+                CDBPAGE *opage = (CDBPAGE *)osbuf;
+                _vio_apnd2_readpage(vio, &opage, db->mtable[page->bid]);
+                if (OFFNOTNULL(opage->ooff)) {
+                    VOFF2ROFF(opage->ooff, ofid, roff);
+                    VIOAPND2FINFO *finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->idxmeta, &ofid, SI4, false);
+                    if (finfo)
+                        finfo->rcyled += opage->osize;
+                }
+                /* fix impaction of old page */
+                db->rnum -= opage->num;
+                if (opage != (CDBPAGE *)osbuf)
+                    free(opage);
+            }
+            db->mtable[page->bid] = page->ooff;
+            db->rnum += page->num;
+            /* the iterator may have handed out a heap page; go back to sbuf */
+            if (page != (CDBPAGE *)sbuf) {
+                free(page);
+                page = (CDBPAGE *)sbuf;
+            }
+        }
+        _vio_apnd2_pageiterdestory(vio, it);
+    }
+
+    /* like what was did just now */
+    it = _vio_apnd2_reciterfirst(vio, db->roid);
+    if (it) {
+        char sbuf[SBUFSIZE];
+        CDBREC *rec = (CDBREC *)sbuf;
+        while(_vio_apnd2_reciternext(vio, &rec, it) == 0) {
+            FOFF soffs[SFOFFNUM];
+            FOFF *soff = soffs, ooff;
+            char sbuf2[SBUFSIZE];
+            OFFZERO(ooff);
+            CDBREC *rrec = (CDBREC*)sbuf2;
+            uint64_t hash = CDBHASH64(rec->buf, rec->ksize);
+
+            /* check record with duplicate key(old version/overwritten maybe */
+            int retnum = cdb_getoff(db, hash, &soff, CDB_NOTLOCKED);
+            for(int i = 0; i < retnum; i++) {
+                if (rrec != (CDBREC*)sbuf2) {
+                    free(rrec);
+                    rrec = (CDBREC*)sbuf2;
+                }
+
+                int cret = _vio_apnd2_readrec(db->vio, &rrec, soff[i], false);
+                if (cret < 0)
+                    continue;
+
+                if (rec->ksize == rrec->ksize && memcmp(rrec->key, rec->key, rec->ksize) == 0) {
+                    ooff = rrec->ooff;
+                    break;
+                }
+            }
+            if (soff != soffs)
+                free(soff);
+            if (rrec != (CDBREC*)sbuf2)
+                free(rrec);
+
+            if (OFFNOTNULL(ooff))
+                /* replace offset in index */
+                cdb_replaceoff(db, hash, ooff, rec->ooff, CDB_NOTLOCKED);
+            else
+                cdb_updatepage(vio->db, hash, rec->ooff, CDB_PAGEINSERTOFF, CDB_NOTLOCKED);
+
+            if (rec->oid > db->oid)
+                db->oid = rec->oid;
+            if (rec != (CDBREC *)sbuf) {
+                free(rec);
+                rec = (CDBREC *)sbuf;
+            }
+        }
+        _vio_apnd2_reciterdestory(vio, it);
+    }
+
+    /* replay deletion logs */
+    FOFF delitems[1024];
+    for(; myio->dfd > 0;) {
+        int ret = read(myio->dfd, delitems, 1024 * sizeof(FOFF));
+        if (ret > 0) {
+            /* only whole entries are replayed; a truncated tail is ignored */
+            for(int j = 0; (size_t)(j + 1) * sizeof(FOFF) <= (size_t)ret; j++) {
+                char sbuf[SBUFSIZE];
+                uint32_t ofid, roff;
+                CDBREC *rec = (CDBREC *)sbuf;
+                if (_vio_apnd2_readrec(vio, &rec, delitems[j], false) < 0)
+                    continue;
+                cdb_updatepage(db, CDBHASH64(rec->key, rec->ksize),
+                               delitems[j], CDB_PAGEDELETEOFF, CDB_NOTLOCKED);
+                /* always decode the offset: ofid is the lookup key below and
+                 * must never be read uninitialized */
+                VOFF2ROFF(delitems[j], ofid, roff);
+                VIOAPND2FINFO *finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &ofid, SI4, false);
+                if (finfo)
+                    finfo->rcyled += rec->osize;
+                if (rec != (CDBREC *)sbuf)
+                    free(rec);
+            }
+        } else {
+            close(myio->dfd);
+            myio->dfd = -1;
+        }
+    }
+
+    cdb_flushalldpage(db);
+    _vio_apnd2_writemeta(vio);
+    _vio_apnd2_writehead(vio, true);
+    cdb_ht_clean(myio->idxmeta);
+    cdb_ht_clean(myio->datmeta);
+    free(idxorders);
+    free(datorders);
+    /* mfd / dfd will be opened again after this function, but hfd won't be */
+    myio->datfhead = myio->datftail = myio->idxfhead = myio->idxftail = NULL;
+    if (myio->ibuf.fd > 0)
+        close(myio->ibuf.fd);
+    if (myio->dbuf.fd > 0)
+        close(myio->dbuf.fd);
+    if (myio->mfd > 0)
+        close(myio->mfd);
+    if (myio->dfd > 0)
+        close(myio->dfd);
+    return 0;
+
+ERRRET:
+    /* dir is NULL when it never opened or was already closed above */
+    if (dir)
+        closedir(dir);
+    if (myio->hfd > 0)
+        close(myio->hfd);
+    if (myio->mfd > 0)
+        close(myio->mfd);
+    if (myio->dfd > 0)
+        close(myio->dfd);
+    free(datorders);
+    free(idxorders);
+    return -1;
+}
+
+
+/* Pick, among the files of the given type, the one with the smallest first
+ * oid (oidf) that is still >= oid, and pin it.
+ *
+ * dtype: VIOAPND2_INDEX or VIOAPND2_DATA; anything else yields NULL.
+ * Returns the chosen file with its ref count incremented, or NULL when no
+ * file qualifies. The search runs under myio->lock. */
+static VIOAPND2FINFO* _vio_apnd2_fileiternext(CDBVIO *vio, int dtype, uint64_t oid)
+{
+    VIOAPND2 *io = (VIOAPND2 *)vio->iometa;
+    CDBHASHTABLE *table;
+    VIOAPND2FINFO *best = NULL;
+    uint64_t bestoid = (uint64_t)-1;
+    CDBHTITEM *cur;
+
+    if (dtype == VIOAPND2_INDEX) {
+        table = io->idxmeta;
+    } else if (dtype == VIOAPND2_DATA) {
+        table = io->datmeta;
+    } else {
+        return NULL;
+    }
+
+    cdb_lock_lock(io->lock);
+    for (cur = cdb_ht_iterbegin(table); cur; cur = cdb_ht_iternext(table, cur)) {
+        VIOAPND2FINFO *cand = (VIOAPND2FINFO *)cdb_ht_itemval(table, cur);
+        /* candidate must beat the current best and not lie before oid */
+        if (cand->oidf < bestoid && cand->oidf >= oid) {
+            bestoid = cand->oidf;
+            best = cand;
+        }
+    }
+    if (best != NULL) {
+        best->ref++;
+    }
+    cdb_lock_unlock(io->lock);
+
+    return best;
+}
+
+/* Give back the iterator's reference on its current file. When that was the
+ * last reference and the file is marked for unlink, the file is removed and
+ * its entry deleted from `meta`. Takes and releases myio->lock itself. */
+static void _vio_apnd2_iterdropfile(CDBVIO *vio, VIOAPND2ITOR *it, int dtype, CDBHASHTABLE *meta)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+
+    cdb_lock_lock(myio->lock);
+    it->finfo->ref--;
+    if (it->finfo->ref == 0 && it->finfo->unlink) {
+        /* unlink the file */
+        _vio_apnd2_unlink(vio, it->finfo, dtype);
+        cdb_ht_del2(meta, &it->finfo->fid, SI4);
+    }
+    cdb_lock_unlock(myio->lock);
+}
+
+/* Position an iterator on the first page/record with an oid >= oid.
+ *
+ * it:    iterator; when it->finfo is NULL a file is selected with
+ *        _vio_apnd2_fileiternext, otherwise the file already set is used.
+ * dtype: VIOAPND2_INDEX (pages) or VIOAPND2_DATA (records).
+ *
+ * The file is opened read-only and mapped; it->off is advanced past every
+ * entry whose oid is below the requested one.
+ *
+ * Returns 0 with it->fd / it->mmap / it->fsize / it->off set up, or -1 when
+ * dtype is unknown, no file is available, the file cannot be opened or
+ * mapped, or it holds no matching entry. On the open, map and no-match
+ * failures the reference on the file is dropped before returning.
+ * NOTE(review): after the no-match failure it->mmap still holds the unmapped
+ * address - confirm callers do not release the iterator a second time. */
+static int _vio_apnd2_iterfirst(CDBVIO *vio, VIOAPND2ITOR *it, int dtype, int64_t oid)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    CDBHASHTABLE *tmpcache;
+    char filename[MAX_PATH_LEN];
+    char ipfx[] = "idx";
+    char dpfx[] = "dat";
+    char *pfx;
+    off_t end;
+    void *map;
+
+    if (dtype == VIOAPND2_INDEX) {
+        pfx = ipfx;
+        tmpcache = myio->idxmeta;
+    } else if (dtype == VIOAPND2_DATA) {
+        pfx = dpfx;
+        tmpcache = myio->datmeta;
+    } else
+        return -1;
+
+    if (it->finfo == NULL)
+        it->finfo = _vio_apnd2_fileiternext(vio, dtype, oid);
+    if (it->finfo == NULL) {
+        return -1;
+    }
+
+    snprintf(filename, MAX_PATH_LEN, "%s/%s%08d.cdb", myio->filepath, pfx, it->finfo->fid);
+    it->fd = open(filename, O_RDONLY, 0644);
+    if (it->fd < 0) {
+        _vio_apnd2_iterdropfile(vio, it, dtype, tmpcache);
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    /* size and map the file; either step may fail and must not leave a
+     * MAP_FAILED pointer behind for the scan below to dereference */
+    end = lseek(it->fd, 0, SEEK_END);
+    map = MAP_FAILED;
+    if (end > 0) {
+        it->fsize = end;
+        map = mmap(NULL, it->fsize, PROT_READ, MAP_PRIVATE, it->fd, 0);
+    }
+    if (map == MAP_FAILED) {
+        close(it->fd);
+        /* NULL makes _vio_apnd2_iterfree skip its unmap/close step */
+        it->mmap = NULL;
+        _vio_apnd2_iterdropfile(vio, it, dtype, tmpcache);
+        cdb_seterrno(vio->db, CDB_READERR, __FILE__, __LINE__);
+        return -1;
+    }
+    it->mmap = map;
+    it->off = FILEMETASIZE;
+    it->oid = oid;
+
+    /* skip ahead to the first entry that is not older than oid */
+    while(it->off < it->fsize) {
+        if (dtype == VIOAPND2_INDEX) {
+            CDBPAGE *page = (CDBPAGE *)(it->mmap + it->off -(sizeof(CDBPAGE) - PAGEHSIZE));
+            if (page->magic != PAGEMAGIC) {
+                it->off += ALIGNBYTES;
+                continue;
+            }
+            if (page->oid >= oid)
+                break;
+            it->off += OFFALIGNED(PAGESIZE(page));
+        } else if (dtype == VIOAPND2_DATA) {
+            CDBREC *rec = (CDBREC *)(it->mmap + it->off -(sizeof(CDBREC) - RECHSIZE));
+            if (rec->magic != RECMAGIC && rec->magic != DELRECMAGIC) {
+                it->off += ALIGNBYTES;
+                continue;
+            }
+            if (rec->oid >= oid)
+                break;
+            it->off += OFFALIGNED(RECSIZE(rec));
+        }
+    }
+
+    /* ran off the end: this file has nothing at or after oid */
+    if (it->off >= it->fsize) {
+        munmap(it->mmap, it->fsize);
+        close(it->fd);
+        _vio_apnd2_iterdropfile(vio, it, dtype, tmpcache);
+        return -1;
+    }
+    return 0;
+}
+
+
+/* Copy the next index page out of the iterator's mapping.
+ *
+ * page: in/out. On entry *page points at a caller buffer of SBUFSIZE bytes.
+ *       A page that fits is copied into it; a larger page is copied into a
+ *       freshly malloc'ed block and *page is redirected to that block, which
+ *       the caller must free.
+ * iter: the VIOAPND2ITOR driving the scan.
+ *
+ * When the current file is exhausted the iterator is released and re-seated
+ * through _vio_apnd2_iterfree / _vio_apnd2_iterfirst. Bytes that do not start
+ * with PAGEMAGIC are skipped ALIGNBYTES at a time.
+ *
+ * Returns 0 with the page filled in (osize, cap and ooff set), or -1 when no
+ * further page is available or memory for a large page cannot be allocated;
+ * *page is left untouched in the latter case. */
+static int _vio_apnd2_pageiternext(CDBVIO *vio, CDBPAGE **page, void *iter)
+{
+    VIOAPND2ITOR *it = (VIOAPND2ITOR *)iter;
+    CDBPAGE *cpage;
+    uint32_t fixbufsize = SBUFSIZE - (sizeof(CDBPAGE) - PAGEHSIZE);
+
+    for(;;) {
+        if (it->off >= it->fsize) {
+            it->oid = CDBMAX(it->oid, it->finfo->oidl);
+            _vio_apnd2_iterfree(vio, VIOAPND2_INDEX, it);
+            if (_vio_apnd2_iterfirst(vio, it, VIOAPND2_INDEX, it->oid) < 0)
+                return -1;
+        }
+        /* place the struct so that its on-disk part starts at it->off */
+        cpage = (CDBPAGE *)(it->mmap + it->off -(sizeof(CDBPAGE) - PAGEHSIZE));
+        if (cpage->magic != PAGEMAGIC) {
+            it->off += ALIGNBYTES;
+            continue;
+        }
+        if (PAGESIZE(cpage) <= fixbufsize)
+            memcpy(&(*page)->magic, &cpage->magic, PAGESIZE(cpage));
+        else {
+            /* size the block from the page being read; the caller's buffer
+             * still holds whatever was there before and says nothing about
+             * this page */
+            CDBPAGE *big = (CDBPAGE *)malloc(sizeof(CDBPAGE) + cpage->num * sizeof(PITEM));
+            if (big == NULL)
+                return -1;
+            *page = big;
+            memcpy(&(*page)->magic, &cpage->magic, PAGESIZE(cpage));
+        }
+        (*page)->osize = PAGESIZE(cpage);
+        (*page)->cap = (*page)->num;
+        ROFF2VOFF(it->finfo->fid, it->off, (*page)->ooff);
+        /* set iterator to next one */
+        it->oid = (*page)->oid + 1;
+        it->off += OFFALIGNED(PAGESIZE(cpage));
+        return 0;
+    }
+    return -1;
+}
+
+/* Copy the next data record out of the iterator's mapping.
+ *
+ * rec:  in/out. On entry *rec points at a caller buffer of SBUFSIZE bytes.
+ *       A record that fits is copied into it; a larger record is copied into
+ *       a freshly malloc'ed block and *rec is redirected to that block, which
+ *       the caller must free.
+ * iter: the VIOAPND2ITOR driving the scan.
+ *
+ * When the current file is exhausted the iterator is released and re-seated
+ * through _vio_apnd2_iterfree / _vio_apnd2_iterfirst. Bytes that start with
+ * neither RECMAGIC nor DELRECMAGIC are skipped ALIGNBYTES at a time.
+ *
+ * Returns 0 with the record filled in (osize, ooff, key and val set), or -1
+ * when no further record is available or memory for a large record cannot be
+ * allocated; *rec is left untouched in the latter case. */
+static int _vio_apnd2_reciternext(CDBVIO *vio, CDBREC **rec, void *iter)
+{
+    VIOAPND2ITOR *it = (VIOAPND2ITOR *)iter;
+    CDBREC *crec;
+    uint32_t fixbufsize = SBUFSIZE - (sizeof(CDBREC) - RECHSIZE);
+
+    for(;;) {
+        if (it->off >= it->fsize) {
+            it->oid = CDBMAX(it->oid, it->finfo->oidl);
+            _vio_apnd2_iterfree(vio, VIOAPND2_DATA, it);
+            if (_vio_apnd2_iterfirst(vio, it, VIOAPND2_DATA, it->oid) < 0)
+                return -1;
+        }
+        /* place the struct so that its on-disk part starts at it->off */
+        crec = (CDBREC *)(it->mmap + it->off -(sizeof(CDBREC) - RECHSIZE));
+        if (crec->magic != RECMAGIC && crec->magic != DELRECMAGIC) {
+            it->off += ALIGNBYTES;
+            continue;
+        }
+        if (RECSIZE(crec) <= fixbufsize)
+            memcpy(&(*rec)->magic, &crec->magic, RECSIZE(crec));
+        else {
+            /* allocate through a temporary so a failure cannot clobber *rec */
+            CDBREC *big = (CDBREC *)malloc(sizeof(CDBREC) + crec->ksize + crec->vsize);
+            if (big == NULL)
+                return -1;
+            *rec = big;
+            memcpy(&(*rec)->magic, &crec->magic, RECSIZE(crec));
+        }
+
+        (*rec)->osize = RECSIZE(crec);
+        (*rec)->expire = crec->expire;
+        ROFF2VOFF(it->finfo->fid, it->off, (*rec)->ooff);
+        /* key and value sit back to back in the copied buffer */
+        (*rec)->key = (*rec)->buf;
+        (*rec)->val = (*rec)->buf + (*rec)->ksize;
+
+        /* set iterator to next one */
+        it->oid = (*rec)->oid + 1;
+        it->off += OFFALIGNED(RECSIZE(crec));
+        return 0;
+    }
+    return -1;
+}
+
+
+/*
+ * Release the file currently mapped by an iterator and step the iterator's
+ * file pointer to the next entry of the file list.
+ *
+ * vio:   I/O object owning the file metadata.
+ * dtype: VIOAPND2_INDEX or VIOAPND2_DATA; selects which metadata table the
+ *        file is removed from if it gets unlinked here.
+ * it:    iterator to release. Nothing happens when it->mmap is NULL.
+ *
+ * Always returns 0.
+ */
+static int _vio_apnd2_iterfree(CDBVIO *vio, int dtype, VIOAPND2ITOR *it)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    if (it->mmap) {
+        /* drop the mapping and descriptor before touching shared metadata */
+        munmap(it->mmap, it->fsize);
+        close(it->fd);
+        cdb_lock_lock(myio->lock);
+        /* the iterator no longer references this file */
+        it->finfo->ref--;
+        if (it->finfo->ref == 0 && it->finfo->unlink) {
+            /* unlink the file */
+            VIOAPND2FINFO *tfinfo;
+            /*
+             * Detach the entry from the fprev/fnext list first.
+             * NOTE(review): fnext and fprev are dereferenced without a NULL
+             * check here, although fnext is tested for NULL further down --
+             * confirm that a file flagged for unlink can never sit at either
+             * end of the list.
+             */
+            it->finfo->fnext->fprev = it->finfo->fprev;
+            it->finfo->fprev->fnext = it->finfo->fnext;
+            tfinfo = it->finfo;
+            /* advance before the detached entry is handed to unlink/delete */
+            it->finfo = it->finfo->fnext;
+            _vio_apnd2_unlink(vio, tfinfo, dtype);
+            /* forget the file in the metadata table matching its type */
+            if (dtype == VIOAPND2_INDEX)
+                cdb_ht_del2(myio->idxmeta, &tfinfo->fid, SI4);
+            else if (dtype == VIOAPND2_DATA)
+                cdb_ht_del2(myio->datmeta, &tfinfo->fid, SI4);
+        } else
+            /* file stays; just move on to its successor */
+            it->finfo = it->finfo->fnext;
+        /* take a reference on the successor, if there is one */
+        if (it->finfo)
+            it->finfo->ref++;
+        cdb_lock_unlock(myio->lock);
+        /* mark the iterator as holding no mapping */
+        it->mmap = NULL;
+    }
+    return 0;
+}
+
+
+/*
+ * Create an iterator over the data files.
+ *
+ * vio: I/O object to iterate.
+ * oid: handed to _vio_apnd2_iterfirst as the starting point (presumably the
+ *      first object id of interest -- confirm against that function).
+ *
+ * Returns an opaque iterator to be released with _vio_apnd2_reciterdestory,
+ * or NULL when the iterator could not be allocated or positioned.
+ */
+static void* _vio_apnd2_reciterfirst(CDBVIO *vio, uint64_t oid)
+{
+    VIOAPND2ITOR *it = (VIOAPND2ITOR *)malloc(sizeof(VIOAPND2ITOR));
+
+    /* out of memory: report failure instead of writing through NULL */
+    if (it == NULL)
+        return NULL;
+
+    /* iterator won't get to buffered data */
+    _vio_apnd2_flushbuf(vio, VIOAPND2_DATA);
+    it->mmap = NULL;
+    it->finfo = NULL;
+    if (_vio_apnd2_iterfirst(vio, it, VIOAPND2_DATA, oid) < 0) {
+        free(it);
+        return NULL;
+    }
+    return (void*)it;
+}
+
+
+/*
+ * Dispose of a data-record iterator: release whatever file it still holds
+ * and free the iterator itself. A NULL iterator is ignored.
+ */
+static void _vio_apnd2_reciterdestory(CDBVIO *vio, void *iter)
+{
+    VIOAPND2ITOR *it = (VIOAPND2ITOR *)iter;
+
+    if (it == NULL)
+        return;
+
+    _vio_apnd2_iterfree(vio, VIOAPND2_DATA, it);
+    free(it);
+}
+
+/*
+ * Create an iterator over the index files.
+ *
+ * vio: I/O object to iterate.
+ * oid: handed to _vio_apnd2_iterfirst as the starting point (presumably the
+ *      first object id of interest -- confirm against that function).
+ *
+ * Returns an opaque iterator to be released with _vio_apnd2_pageiterdestory,
+ * or NULL when the iterator could not be allocated or positioned.
+ */
+static void* _vio_apnd2_pageiterfirst(CDBVIO *vio, uint64_t oid)
+{
+    VIOAPND2ITOR *it = (VIOAPND2ITOR *)malloc(sizeof(VIOAPND2ITOR));
+
+    /* out of memory: report failure instead of writing through NULL */
+    if (it == NULL)
+        return NULL;
+
+    /* iterator won't get to buffered data */
+    _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX);
+    it->mmap = NULL;
+    it->finfo = NULL;
+    if (_vio_apnd2_iterfirst(vio, it, VIOAPND2_INDEX, oid) < 0) {
+        free(it);
+        return NULL;
+    }
+    return (void*)it;
+}
+
+
+/*
+ * Dispose of an index-page iterator: release whatever file it still holds
+ * and free the iterator itself. A NULL iterator is ignored.
+ */
+static void _vio_apnd2_pageiterdestory(CDBVIO *vio, void *iter)
+{
+    VIOAPND2ITOR *it = (VIOAPND2ITOR *)iter;
+
+    if (it == NULL)
+        return;
+
+    _vio_apnd2_iterfree(vio, VIOAPND2_INDEX, it);
+    free(it);
+}
+
+/*
+ * Scan one data file and refresh its recycling metadata; optionally move
+ * every live record out of it so the file can be dropped.
+ *
+ * vio:   I/O object owning the file.
+ * finfo: metadata of the data file "dat<fid>.cdb" to scan.
+ * rcyle: false - only scan and update finfo->nexpire / finfo->rcyled.
+ *        true  - additionally re-append each live record through
+ *                _vio_apnd2_writerecinternal, repoint the index at the new
+ *                copy, remove expired records from the index, and flag the
+ *                file for unlink.
+ *
+ * Returns 0 on success, -1 (with CDB_OPENERR recorded) when the file cannot
+ * be opened, sized or mapped.
+ */
+static int _vio_apnd2_rcyledatafile(CDBVIO *vio, VIOAPND2FINFO *finfo, bool rcyle)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    int fd;
+    char filename[MAX_PATH_LEN];
+    uint32_t nexpire = 0xffffffff;
+
+    snprintf(filename, MAX_PATH_LEN, "%s/dat%08d.cdb", myio->filepath, finfo->fid);
+    fd = open(filename, O_RDONLY, 0644);
+    if (fd < 0) {
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    off_t fend = lseek(fd, 0, SEEK_END);
+    if (fend < 0) {
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        close(fd);
+        return -1;
+    }
+
+    uint32_t frsize = 0, fsize = (uint32_t)fend;
+    uint32_t pos = FILEMETASIZE;
+    char *map = NULL;
+    uint32_t now = time(NULL);
+
+    /* a file holding nothing past its meta area has no records to map */
+    if (fsize > pos) {
+        /*
+         * The recycle branch below stores ooff/osize through 'rec', which
+         * points into this mapping, so it has to be writable. MAP_PRIVATE
+         * keeps those stores in memory only; the file itself (opened
+         * read-only) is never modified.
+         */
+        map = mmap(NULL, fsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+        if (map == MAP_FAILED) {
+            cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+            close(fd);
+            return -1;
+        }
+    }
+
+    while(pos < fsize) {
+        /* back the pointer up so that rec->magic lands exactly on pos */
+        CDBREC *rec = (CDBREC *)&map[pos-(sizeof(CDBREC) - RECHSIZE)];
+        FOFF off;
+        uint64_t hash;
+
+        if (rec->magic != RECMAGIC && rec->magic != DELRECMAGIC) {
+            /* not a record start, probe the next aligned position */
+            pos += ALIGNBYTES;
+            continue;
+        }
+
+        ROFF2VOFF(finfo->fid, pos, off);
+        hash = CDBHASH64(rec->buf, rec->ksize);
+        if (cdb_checkoff(vio->db, hash, off, CDB_NOTLOCKED)
+        /* not expired */
+        && (rec->expire > now || rec->expire == 0)) {
+            /* nearest expire record in current file */
+            if (rec->expire && rec->expire < nexpire)
+                nexpire = rec->expire;
+
+            /* record exist in index, skip */
+            if (rcyle) {
+                FOFF noff;
+                rec->ooff = off;
+                rec->osize = OFFALIGNED(RECSIZE(rec));
+                _vio_apnd2_writerecinternal(vio, rec, &noff);
+                cdb_replaceoff(vio->db, hash, off, noff, CDB_NOTLOCKED);
+            }
+        } else {
+            /*
+             * '<=' mirrors the liveness test above: a record expiring exactly
+             * now is not rewritten, so its index entry must go as well or it
+             * would keep pointing into a file that is about to be unlinked.
+             */
+            if (rcyle && rec->expire && rec->expire <= now) {
+                /* expired record, delete from index page */
+                cdb_updatepage(vio->db, hash, off, CDB_PAGEDELETEOFF, CDB_NOTLOCKED);
+            }
+            /* dead or expired space counts as reclaimable */
+            frsize += OFFALIGNED(RECSIZE(rec));
+        }
+        pos += OFFALIGNED(RECSIZE(rec));
+    }
+    if (map != NULL)
+        munmap(map, fsize);
+    close(fd);
+    cdb_lock_lock(myio->lock);
+    /* fix metainfo about nearest expire time in current data file */
+    if (nexpire == 0xffffffff)
+        finfo->nexpire = 0;
+    else
+        finfo->nexpire = nexpire;
+    finfo->rcyled = frsize;
+    if (rcyle) {
+        /* unlink */
+        finfo->unlink = true;
+    }
+    cdb_lock_unlock(myio->lock);
+    return 0;
+}
+
+
+/*
+ * Flush the data and index buffers, write the head via
+ * _vio_apnd2_writehead(vio, false), and start a fresh deletion log
+ * ("dellog.cdb" under the database path). Everything runs under the
+ * I/O lock. A failure to reopen the log is only recorded via cdb_seterrno.
+ */
+static void _vio_apnd2_cleanpoint(CDBVIO *vio)
+{
+    char dellogpath[MAX_PATH_LEN];
+    VIOAPND2 *io = (VIOAPND2 *)vio->iometa;
+
+    cdb_lock_lock(io->lock);
+
+    /* push buffered records and pages out before the head is written */
+    _vio_apnd2_flushbuf(vio, VIOAPND2_DATA);
+    _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX);
+    _vio_apnd2_writehead(vio, false);
+
+    /* drop the old log descriptor, if one is open */
+    if (io->dfd > 0)
+        close(io->dfd);
+
+    /* reopening with O_TRUNC wipes the previous deletion log */
+    snprintf(dellogpath, MAX_PATH_LEN, "%s/dellog.cdb", io->filepath);
+    io->dfd = open(dellogpath, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+    if (io->dfd < 0) {
+        /* nobody to return the failure to; leave it in the db error slot */
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+    }
+
+    cdb_lock_unlock(io->lock);
+}
+
+
+/*
+ * Read the open-signature word from the head file.
+ *
+ * Returns the stored value, or -1 when the head file is not open or the
+ * read comes back short.
+ */
+static int _vio_apnd2_checkopensig(CDBVIO *vio)
+{
+    VIOAPND2 *io = (VIOAPND2 *)vio->iometa;
+    /* the word sits behind the magic, one SI4 field and three SI8 fields */
+    int sigoff = FILEMAGICLEN + SI4 + SI8 + SI8 + SI8;
+    uint32_t sig;
+
+    if (io->hfd < 0)
+        return -1;
+
+    if (pread(io->hfd, &sig, SI4, sigoff) != SI4)
+        return -1;
+
+    return sig;
+}
+
+
+/*
+ * Store the open-signature word in the head file.
+ *
+ * Returns 0 on success, -1 when the head file is not open or the write
+ * comes back short.
+ */
+static int _vio_apnd2_setopensig(CDBVIO *vio, int sig)
+{
+    VIOAPND2 *io = (VIOAPND2 *)vio->iometa;
+    /* the word sits behind the magic, one SI4 field and three SI8 fields */
+    int sigoff = FILEMAGICLEN + SI4 + SI8 + SI8 + SI8;
+    uint32_t word = sig;
+
+    if (io->hfd < 0)
+        return -1;
+
+    return (pwrite(io->hfd, &word, SI4, sigoff) != SI4) ? -1 : 0;
+}
+
+
diff --git a/libdap-cuttdb/src/vio_apnd2.h b/libdap-cuttdb/src/vio_apnd2.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb47a7dbd18a0f40ad1f9a571b35b94feaeff6d5
--- /dev/null
+++ b/libdap-cuttdb/src/vio_apnd2.h
@@ -0,0 +1,23 @@
+/*
+ *   CuttDB - a fast key-value storage engine
+ *
+ *
+ *   http://code.google.com/p/cuttdb/
+ *   
+ *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
+ *   Use and distribution licensed under the BSD license. 
+ *   See the LICENSE file for full text
+ *
+ *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#ifndef _VIO_APND2_H_
+#define _VIO_APND2_H_
+#include "cdb_vio.h"
+
+
+void vio_apnd2_init(CDBVIO *vio);
+
+#endif