# libdap-cuttdb/CMakeLists.txt
# Builds the bundled CuttDB key-value engine as the static library dap_cuttdb.

# list(FILTER ...) is used below; it only exists since CMake 3.6.
cmake_minimum_required(VERSION 3.6)

project(dap_cuttdb)

add_definitions ("-D_GNU_SOURCE")
# Append to the inherited flags instead of discarding what the parent project
# or the user configured.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu11 -Wall -Wextra -fPIC")

file(GLOB cuttdb_src src/*.c)
file(GLOB cuttdb_h src/*.h)

# The server part isn't ported and thus not built; neither are the tests.
# The globbed entries are full paths, so each pattern is anchored to the file
# name ("/<name>.c" at the end of the path) and the dot is escaped. An
# unanchored "ae." would also match any directory of the checkout path that
# happens to contain "ae" and silently drop every source file.
list(FILTER cuttdb_src EXCLUDE REGEX "/ae[^/]*\\.c$")
list(FILTER cuttdb_src EXCLUDE REGEX "/[^/]*server[^/]*\\.c$")
list(FILTER cuttdb_src EXCLUDE REGEX "/[^/]*dump[^/]*\\.c$")
list(FILTER cuttdb_src EXCLUDE REGEX "/[^/]*builddb[^/]*\\.c$")
list(FILTER cuttdb_src EXCLUDE REGEX "/[^/]*test_mt[^/]*\\.c$")

if(UNIX)
  # mman.* is only needed where the platform has no native mmap.
  list(FILTER cuttdb_src EXCLUDE REGEX "/[^/]*mman[^/]*\\.c$")
  list(FILTER cuttdb_h EXCLUDE REGEX "/[^/]*mman[^/]*\\.h$")
endif()

add_library(${PROJECT_NAME} STATIC ${cuttdb_h} ${cuttdb_src})

target_link_libraries(${PROJECT_NAME} -lpthread)

target_include_directories(${PROJECT_NAME} INTERFACE src)
"epoll_ctl(%d,%d) failed: %d\n", EPOLL_CTL_ADD,fd,errno); + return -1; + } + return 0; +} + +static int aeApiUpdateEvent(EventLoop *eventLoop, int fd, int mask) { + aeApiState *state = eventLoop->apidata; + struct epoll_event ee; + ee.events = EPOLLONESHOT; + if (mask & AE_READABLE) ee.events |= EPOLLIN; + if (mask & AE_WRITABLE) ee.events |= EPOLLOUT; + ee.data.u64 = 0; /* avoid valgrind warning */ + ee.data.fd = fd; + if (epoll_ctl(state->epfd, EPOLL_CTL_MOD,fd,&ee) == -1) { + fprintf(stderr, "epoll_ctl(%d,%d) failed: %d\n", EPOLL_CTL_ADD,fd,errno); + return -1; + } + return 0; +} + +static int aeApiDelEvent(EventLoop *eventLoop, int fd) { + aeApiState *state = eventLoop->apidata; + struct epoll_event ee; + + ee.events = 0; + ee.data.u64 = 0; /* avoid valgrind warning */ + ee.data.fd = fd; + /* Note, Kernel < 2.6.9 requires a non null event pointer even for + * EPOLL_CTL_DEL. */ + if ( epoll_ctl(state->epfd,EPOLL_CTL_DEL,fd,&ee) == -1 + && errno != ENOENT && errno != EBADF) { + fprintf(stderr, "epoll_ctl(%d,%d) failed: %d\n", EPOLL_CTL_DEL,fd,errno); + return -1; + } + return 0; +} + +int aeApiPoll(EventLoop *eventLoop, struct timeval *tvp) { + aeApiState *state = eventLoop->apidata; + int retval, numevents = 0; + + retval = epoll_wait(state->epfd,state->events,AE_SETSIZE, + tvp ? 
(tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); + if (retval > 0) { + int j; + + numevents = retval; + for (j = 0; j < numevents; j++) { + int mask = 0; + struct epoll_event *e = state->events+j; + + if (e->events & EPOLLIN) mask |= AE_READABLE; + if (e->events & EPOLLOUT) mask |= AE_WRITABLE; + eventLoop->fired[j] = e->data.fd; + } + } + return numevents; +} + + +/* + be not referenced anywhere +static char *aeApiName(void) { + return "epoll"; +} +*/ + diff --git a/libdap-cuttdb/src/ae_kqueue.c b/libdap-cuttdb/src/ae_kqueue.c new file mode 100644 index 0000000000000000000000000000000000000000..cd80a57be2d19d485f2a2ce3485b42494ba43640 --- /dev/null +++ b/libdap-cuttdb/src/ae_kqueue.c @@ -0,0 +1,91 @@ +/* Kqueue(2)-based ae.c module + * Copyright (C) 2009 Harish Mallipeddi - harish.mallipeddi@gmail.com + * Released under the BSD license. See the COPYING file for more info. */ + +#include <sys/types.h> +#include <sys/event.h> +#include <sys/time.h> + +typedef struct aeApiState { + int kqfd; + struct kevent events[AE_SETSIZE]; +} aeApiState; + +static int aeApiCreate(EventLoop *eventLoop) { + aeApiState *state = malloc(sizeof(aeApiState)); + + if (!state) return -1; + state->kqfd = kqueue(); + if (state->kqfd == -1) return -1; + eventLoop->apidata = state; + + return 0; +} + +static void aeApiFree(EventLoop *eventLoop) { + aeApiState *state = eventLoop->apidata; + + close(state->kqfd); + free(state); +} + +static int aeApiAddEvent(EventLoop *eventLoop, int fd, int mask) { + aeApiState *state = eventLoop->apidata; + struct kevent ke; + + if (mask & AE_READABLE) { + EV_SET(&ke, fd, EVFILT_READ, EV_ADD, 0, 0, NULL); + if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1; + } + if (mask & AE_WRITABLE) { + EV_SET(&ke, fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL); + if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1; + } + return 0; +} + +static int aeApiUpdateEvent(EventLoop *eventLoop, int fd, int mask) { + return aeApiAddEvent(eventLoop, fd, mask); +} + 
+static int aeApiDelEvent(EventLoop *eventLoop, int fd) { + aeApiState *state = eventLoop->apidata; + struct kevent ke; + + EV_SET(&ke, fd, EVFILT_READ | EVFILT_WRITE, EV_DELETE, 0, 0, NULL); + kevent(state->kqfd, &ke, 1, NULL, 0, NULL); + return 0; +} + +static int aeApiPoll(EventLoop *eventLoop, struct timeval *tvp) { + aeApiState *state = eventLoop->apidata; + int retval, numevents = 0; + + if (tvp != NULL) { + struct timespec timeout; + timeout.tv_sec = tvp->tv_sec; + timeout.tv_nsec = tvp->tv_usec * 1000; + retval = kevent(state->kqfd, NULL, 0, state->events, AE_SETSIZE, &timeout); + } else { + retval = kevent(state->kqfd, NULL, 0, state->events, AE_SETSIZE, NULL); + } + + if (retval > 0) { + int j; + + numevents = retval; + for(j = 0; j < numevents; j++) { + int mask = 0; + struct kevent *e = state->events+j; + + if (e->filter == EVFILT_READ) mask |= AE_READABLE; + if (e->filter == EVFILT_WRITE) mask |= AE_WRITABLE; + eventLoop->fired[j] = e->ident; + } + } + return numevents; +} + +static char *aeApiName(void) { + return "kqueue"; +} diff --git a/libdap-cuttdb/src/ae_select.c b/libdap-cuttdb/src/ae_select.c new file mode 100644 index 0000000000000000000000000000000000000000..1e5d3ae91aa886a4b086ff07c28b9e10045ea292 --- /dev/null +++ b/libdap-cuttdb/src/ae_select.c @@ -0,0 +1,72 @@ +/* Select()-based ae.c module + * Copyright (C) 2009-2010 Salvatore Sanfilippo - antirez@gmail.com + * Released under the BSD license. See the COPYING file for more info. */ + +#include <string.h> + +typedef struct aeApiState { + fd_set rfds, wfds; + /* We need to have a copy of the fd sets as it's not safe to reuse + * FD sets after select(). 
*/ + fd_set _rfds, _wfds; +} aeApiState; + +static int aeApiCreate(EventLoop *eventLoop) { + aeApiState *state = malloc(sizeof(aeApiState)); + + if (!state) return -1; + FD_ZERO(&state->rfds); + FD_ZERO(&state->wfds); + eventLoop->apidata = state; + return 0; +} + +static void aeApiFree(EventLoop *eventLoop) { + free(eventLoop->apidata); +} + +static int aeApiAddEvent(EventLoop *eventLoop, int fd, int mask) { + aeApiState *state = eventLoop->apidata; + + if (mask & AE_READABLE) FD_SET(fd,&state->rfds); + if (mask & AE_WRITABLE) FD_SET(fd,&state->wfds); + return 0; +} + +static void aeApiDelEvent(EventLoop *eventLoop, int fd, int mask) { + aeApiState *state = eventLoop->apidata; + + if (mask & AE_READABLE) FD_CLR(fd,&state->rfds); + if (mask & AE_WRITABLE) FD_CLR(fd,&state->wfds); +} + +static int aeApiPoll(EventLoop *eventLoop, struct timeval *tvp) { + aeApiState *state = eventLoop->apidata; + int retval, j, numevents = 0; + + memcpy(&state->_rfds,&state->rfds,sizeof(fd_set)); + memcpy(&state->_wfds,&state->wfds,sizeof(fd_set)); + + retval = select(eventLoop->maxfd+1, + &state->_rfds,&state->_wfds,NULL,tvp); + if (retval > 0) { + for (j = 0; j <= eventLoop->maxfd; j++) { + int mask = 0; + aeFileEvent *fe = &eventLoop->events[j]; + + if (fe->mask == AE_NONE) continue; + if (fe->mask & AE_READABLE && FD_ISSET(j,&state->_rfds)) + mask |= AE_READABLE; + if (fe->mask & AE_WRITABLE && FD_ISSET(j,&state->_wfds)) + mask |= AE_WRITABLE; + eventLoop->fired[numevents].fd = j; + eventLoop->fired[numevents].mask = mask; + numevents++; + } + } + return numevents; +} + +static char *aeApiName(void) { + return "select"; +} diff --git a/libdap-cuttdb/src/cdb_bgtask.c b/libdap-cuttdb/src/cdb_bgtask.c new file mode 100644 index 0000000000000000000000000000000000000000..822c02c1299c1f03fd5b738f50183e474dffce0a --- /dev/null +++ b/libdap-cuttdb/src/cdb_bgtask.c @@ -0,0 +1,128 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * 
Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. + * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#include "cdb_bgtask.h" +#include <stdlib.h> +#ifndef _WIN32 +#include <sys/signal.h> +#else +#include <signal.h> +#endif + + +/* where thread begins */ +static void *_cdb_bgtask_func(void *arg); + + +CDBBGTASK *cdb_bgtask_new() +{ + CDBBGTASK *bt = (CDBBGTASK *)malloc(sizeof(CDBBGTASK)); + + bt->tnum = 0; + bt->run = 0; + bt->tid = 0; + pthread_cond_init(&bt->scond, NULL); + pthread_mutex_init(&bt->smutex, NULL); + return bt; +} + + +/* add a task into task list, must called before the thread run */ +int cdb_bgtask_add(CDBBGTASK *bt, TASKFUNC func, void *arg, int intval) +{ + TASK *task = &bt->tasks[bt->tnum]; + + if (bt->tid || bt->tnum > MAXTASKNUM) + return -1; + + task->arg = arg; + task->func = func; + task->intval = intval; + task->ltime = time(NULL); + bt->tnum++; + return 0; +} + + +static void *_cdb_bgtask_func(void *arg) +{ + CDBBGTASK *bt = (CDBBGTASK *)arg; +#ifndef _WIN32 + /* block all signals coming into current thread */ + _sigset_t smask; + sigfillset(&smask); + pthread_sigmask(SIG_BLOCK, &smask, NULL); +#endif + /* loop */ + while(bt->run) { + time_t now = time(NULL); + struct timespec timeout; + + /* check should run some tasks every 1 second */ + timeout.tv_sec = now + 1; + timeout.tv_nsec = 0; + + /* iterate and run the tasks */ + for(int i = 0; i < bt->tnum; i++) { + TASK *task = &bt->tasks[i]; + if (now >= task->ltime + task->intval) { + task->func(task->arg); + task->ltime = now; + } + } + pthread_cond_timedwait(&bt->scond, &bt->smutex, &timeout); + } + + return NULL; +} + + +/* create a thread for tasks */ +void cdb_bgtask_start(CDBBGTASK *bt) +{ + if (bt->run) + return; + + bt->run = 1; + pthread_create(&bt->tid, NULL, _cdb_bgtask_func, bt); + return; +} + + +/* wait for the task thread exits */ +void cdb_bgtask_stop(CDBBGTASK 
*bt) +{ + if (bt->run) { + void **ret = NULL; + bt->run = 0; + pthread_cond_signal(&bt->scond); + pthread_join(bt->tid, ret); + } + + bt->tnum = 0; +} + + +void cdb_bgtask_destroy(CDBBGTASK *bt) +{ + cdb_bgtask_stop(bt); + pthread_cond_destroy(&bt->scond); + pthread_mutex_destroy(&bt->smutex); + free(bt); +} + + + + diff --git a/libdap-cuttdb/src/cdb_bgtask.h b/libdap-cuttdb/src/cdb_bgtask.h new file mode 100644 index 0000000000000000000000000000000000000000..6dee1b992d21bac8a496e7f8f89431c570b9358a --- /dev/null +++ b/libdap-cuttdb/src/cdb_bgtask.h @@ -0,0 +1,62 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. + * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#ifndef _CDB_BGTASK_H_ +#define _CDB_BGTASK_H_ +#include <time.h> +#include <pthread.h> + + +/* 16 tasks at most in a task thread */ +#define MAXTASKNUM 16 + +typedef void (*TASKFUNC)(void *); + +/* struct for timer task */ +typedef struct { + /* task function */ + TASKFUNC func; + /* task argument */ + void *arg; + /* task run interval(seconds) */ + int intval; + /* time of last run */ + time_t ltime; +} TASK; + +/* struct for a background task manager */ +typedef struct CDBBGTASK +{ + TASK tasks[MAXTASKNUM]; + /* number of tasks */ + int tnum; + /* is running? 
/* ================== libdap-cuttdb/src/cdb_bloomfilter.h ================== */
/*
 *   CuttDB - a fast key-value storage engine
 *
 *
 *   http://code.google.com/p/cuttdb/
 *
 *   Copyright (c) 2012, Siyuan Fu.  All rights reserved.
 *   Use and distribution licensed under the BSD license.
 *   See the LICENSE file for full text
 *
 *   Author: Siyuan Fu <fusiyuan2010@gmail.com>
 *
 */


/*
Bloom Filter is currently not used in cuttdb
*/
#ifndef _CDB_BLOOMFILTER_H_
#define _CDB_BLOOMFILTER_H_
#include <stdbool.h>
#include <stdint.h>

typedef struct CDBBLOOMFILTER CDBBLOOMFILTER;

#define CDBBFRATIO 8

CDBBLOOMFILTER *cdb_bf_new(uint64_t rnum, uint64_t size);
void cdb_bf_set(CDBBLOOMFILTER *bf, void *key, int ksize);
bool cdb_bf_exist(CDBBLOOMFILTER *bf, void *key, int ksize);
void cdb_bf_clean(CDBBLOOMFILTER *bf);
void cdb_bf_destroy(CDBBLOOMFILTER *bf);

#endif

/* ================== libdap-cuttdb/src/cdb_bloomfilter.c ================== */
#include <stdlib.h>
#include <string.h>

#define CDBBFHASHNUM 16
#define CDBBFSPLITPOW 6
/* the bitmap is split into this many separately allocated slices */
#define CDBBFSLICES (1 << CDBBFSPLITPOW)

/* one multiplier per hash function */
static const uint64_t BFSEEDS[CDBBFHASHNUM] = {217636919,290182597,386910137,515880193,
                                        687840301,917120411,1222827239,1610612741,
                                        3300450239,3300450259,3300450281,3300450289,
                                        3221225473ul,4294967291ul,163227661,122420729,};

struct CDBBLOOMFILTER
{
    uint8_t *bitmap[CDBBFSLICES];
    /* number of keys set so far */
    uint64_t rnum;
    /* total bitmap size in bytes; each slice holds size >> CDBBFSPLITPOW */
    uint64_t size;
    /* number of hash functions in use, 1..CDBBFHASHNUM */
    int hnum;
    int ratio;
};


/* Compute the first bf->hnum multiplicative hashes of key into hval[];
 * the remaining entries are zeroed. */
static void _cdb_bf_hash(const CDBBLOOMFILTER *bf, const void *key, int ksize,
                         uint64_t hval[CDBBFHASHNUM])
{
    const uint8_t *src = (const uint8_t *)key;
    const uint8_t *end = src + ksize;

    memset(hval, 0, sizeof(uint64_t) * CDBBFHASHNUM);
    for (; src < end; src++)
        for (int i = 0; i < bf->hnum; i++)
            hval[i] = hval[i] * BFSEEDS[i] + *src;
}


/* Map one hash value to its byte inside the bitmap and the bit within that
 * byte: the low CDBBFSPLITPOW bits pick the slice, the rest the position. */
static uint8_t *_cdb_bf_locate(const CDBBLOOMFILTER *bf, uint64_t hash, uint8_t *bit)
{
    uint64_t p = (hash >> CDBBFSPLITPOW) % ((bf->size >> CDBBFSPLITPOW) << 3);

    *bit = (uint8_t)(1u << (p & 0x07));
    return &bf->bitmap[hash & (CDBBFSLICES - 1)][p >> 3];
}


/* Create a filter of `size` bytes dimensioned for about `rnum` keys.
 * The number of hash functions is 0.7 * bits-per-key, limited to
 * [1, CDBBFHASHNUM]; rnum == 0 selects the maximum. A size smaller than
 * one byte per slice is raised to that minimum.
 * Returns NULL when memory cannot be allocated. */
CDBBLOOMFILTER *cdb_bf_new(uint64_t rnum, uint64_t size)
{
    CDBBLOOMFILTER *bf = (CDBBLOOMFILTER *)calloc(1, sizeof(CDBBLOOMFILTER));
    uint64_t slice = size >> CDBBFSPLITPOW;
    uint64_t hnum;

    if (bf == NULL)
        return NULL;

    /* an empty slice would make the position computation divide by zero */
    if (slice == 0) {
        slice = 1;
        size = (uint64_t)CDBBFSLICES;
    }
    if (slice > SIZE_MAX) {
        free(bf);
        return NULL;
    }

    bf->rnum = 0;
    bf->size = size;

    /* number of hash should be 0.7 * ratio; clamp while still 64 bits wide
       so that narrowing to int cannot wrap */
    if (rnum == 0) {
        hnum = CDBBFHASHNUM;
    } else {
        uint64_t num = size > UINT64_MAX / 56 ? UINT64_MAX : size * 8 * 7;
        uint64_t den = rnum > UINT64_MAX / 10 ? UINT64_MAX : rnum * 10;
        hnum = num / den;
    }
    /* number of hash is limit in [1, 16] */
    if (hnum > CDBBFHASHNUM)
        hnum = CDBBFHASHNUM;
    if (hnum == 0)
        hnum = 1;
    bf->hnum = (int)hnum;

    /* avoid malloc too much memory once */
    for (int i = 0; i < CDBBFSLICES; i++) {
        bf->bitmap[i] = (uint8_t *)calloc(1, (size_t)slice);
        if (bf->bitmap[i] == NULL) {
            /* slices not reached yet are still NULL and free() accepts that */
            cdb_bf_destroy(bf);
            return NULL;
        }
    }
    return bf;
}


/* Add the ksize bytes at key to the filter. */
void cdb_bf_set(CDBBLOOMFILTER *bf, void *key, int ksize)
{
    uint64_t hval[CDBBFHASHNUM];

    _cdb_bf_hash(bf, key, ksize, hval);
    for (int i = 0; i < bf->hnum; i++) {
        uint8_t bit;
        uint8_t *byte = _cdb_bf_locate(bf, hval[i], &bit);
        *byte |= bit;
    }

    bf->rnum++;
}


/* Return false when key was definitely never set, true when it may have
 * been (false positives are possible). */
bool cdb_bf_exist(CDBBLOOMFILTER *bf, void *key, int ksize)
{
    uint64_t hval[CDBBFHASHNUM];

    _cdb_bf_hash(bf, key, ksize, hval);
    for (int i = 0; i < bf->hnum; i++) {
        uint8_t bit;
        const uint8_t *byte = _cdb_bf_locate(bf, hval[i], &bit);
        if (!(*byte & bit))
            return false;
    }

    return true;
}

/* Forget all keys; the filter keeps its size. */
void cdb_bf_clean(CDBBLOOMFILTER *bf)
{
    for (int i = 0; i < CDBBFSLICES; i++)
        memset(bf->bitmap[i], 0, bf->size >> CDBBFSPLITPOW);

    bf->rnum = 0;
}


/* Release the filter. NULL is ignored. */
void cdb_bf_destroy(CDBBLOOMFILTER *bf)
{
    if (bf == NULL)
        return;

    for (int i = 0; i < CDBBFSLICES; i++)
        free(bf->bitmap[i]);
    free(bf);
}


#ifdef _UT_CDBBF_
#include <stdio.h>
#include <stdlib.h>

/* Stand-alone check: usage is `prog [rnum [size]]`. Sets the even numbers
 * and reports hit rate for them and the false positive rate for odd ones. */
int main(int argc, char *argv[])
{
    int size = 1048576;
    int rnum = 1048576;
    if (argc > 1)
        rnum = atoi(argv[1]);
    if (argc > 2)
        size = atoi(argv[2]);

    CDBBLOOMFILTER *bf = cdb_bf_new(rnum, size);
    if (bf == NULL) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    for(int i = 0; i < rnum; i++) {
        int j = 2 * i;
        cdb_bf_set(bf, &j, 4);
    }

    int exist = 0;
    for(int i = 0; i < rnum; i++) {
        int j = 2 * i;
        if (cdb_bf_exist(bf, &j, 4))
            exist++;
    }
    printf("right positive: %.2f%%%%\n", (float)exist/(float)rnum*10000);

    exist = 0;
    for(int i = 0; i < rnum * 2; i++) {
        int j = 2 * i + 1;
        if (cdb_bf_exist(bf, &j, 4))
            exist++;
    }

    printf("false positive: %.2f%%%%  %d/%d\n", (float)exist/(float)rnum*5000, exist, rnum * 2);
    /* rnum is 64 bits wide; print it with a matching conversion */
    printf("element num: %llu\n", (unsigned long long)bf->rnum);
    cdb_bf_destroy(bf);
    return 0;
}
#endif
+ * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + +#include "cuttdb.h" +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <time.h> + +int main(int argc, char *argv[]) +{ + CDB *db = cdb_new(); + if (argc < 2) { + fprintf(stderr, "Usage: %s db_path [hsize = 2000000]\n", argv[0]); + return 0; + } + + /* 1TB memory limit(unlimited) */ + cdb_option(db, argc >= 3? atoi(argv[2]):2000000 , 0, 1048576); + cdb_seterrcb(db, cdb_deferrorcb, NULL); + if (cdb_open(db, argv[1], CDB_CREAT | CDB_PAGEWARMUP) < 0) { + return -1; + } + char *buf = NULL; + long count = 0; + + size_t size, size2; + while((size = getline(&buf, &size2, stdin)) != -1) { + /* remove the delimiter*/ + buf[--size] = '\0'; + int klen = -1; + int vlen = -1; + uint32_t expire = 0; + int parsenum = 0; + for(int i = 0; i < size; i++) { + if (buf[i] == '\t') { + if (klen == -1) + klen = i; + else { + vlen = i - klen - 1; + parsenum = 1; + } + } else if (buf[i] >= '0' && buf[i] <= '9' && parsenum) { + expire = expire * 10 + buf[i] - '0'; + } + } + + if (klen > 0 && vlen > 0) { + cdb_set2(db, buf, klen, buf + klen + 1, vlen, + CDB_OVERWRITE, expire > 0? expire - time(NULL): 0); + count++; + } + free(buf); + buf = NULL; + } + cdb_destroy(db); + fprintf(stderr, "imported %ld records\n", count); + return 0; +} + + diff --git a/libdap-cuttdb/src/cdb_core.c b/libdap-cuttdb/src/cdb_core.c new file mode 100644 index 0000000000000000000000000000000000000000..79356a0f2b25659da3e71e12e2e670af99b5d075 --- /dev/null +++ b/libdap-cuttdb/src/cdb_core.c @@ -0,0 +1,1396 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. 
+ * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#include "cuttdb.h" +#include "cdb_crc64.h" +#include "cdb_types.h" +#include "cdb_hashtable.h" +#include "cdb_bloomfilter.h" +#include "cdb_lock.h" +#include "cdb_bgtask.h" +#include "cdb_errno.h" +#include "cdb_vio.h" +#include "cdb_core.h" +#include <stdio.h> +#include <string.h> +#include <time.h> +#include <pthread.h> + +static void _cdb_pageout(CDB *db); +static void _cdb_defparam(CDB *db); +static void _cdb_recout(CDB *db); +static uint32_t _pagehash(const void *key, int len); +static void _cdb_flushdpagetask(void *arg); +static void _cdb_timerreset(struct timespec *ts); +static uint32_t _cdb_timermicrosec(struct timespec *ts); +static void _cdb_pagewarmup(CDB *db, bool loadbf); + + +/* it isn't necessary to rehash bid in hash table cache */ +static uint32_t _pagehash(const void *key, int len) +{ + return *(uint32_t*)key; +} + + +/* used to get the duration of a procedure */ +static void _cdb_timerreset(struct timespec *ts) +{ + clock_gettime(CLOCK_MONOTONIC, ts); +} + + +static uint32_t _cdb_timermicrosec(struct timespec *ts) +{ + struct timespec ts2; + uint32_t diff; + clock_gettime(CLOCK_MONOTONIC, &ts2); + diff = (ts2.tv_sec - ts->tv_sec) * 1000000; + diff += ts2.tv_nsec / 1000; + diff -= ts->tv_nsec / 1000; + return diff; +} + + +/* reset the parameters */ +static void _cdb_defparam(CDB *db) +{ + db->rnum = 0; + db->bfsize = 0; + db->rclimit = 128 * MB; + db->pclimit = 1024 * MB; + db->hsize = 1000000; + db->rcache = db->pcache = db->dpcache = NULL; + db->bf = NULL; + db->opened = false; + db->vio = NULL; + db->mtable = NULL; + db->oid = 0; + db->roid = 0; + db->errcbarg = NULL; + db->errcb = NULL; + db->areadsize = 4 * KB; + return; +} + + +/* flush all dirty pages */ +void cdb_flushalldpage(CDB *db) +{ + if (db->dpcache) { + while (db->dpcache->num) { + CDBHTITEM *item = cdb_ht_poptail(db->dpcache); + uint32_t bid = 
*(uint32_t*)cdb_ht_itemkey(db->dpcache, item); + FOFF off; + db->vio->wpage(db->vio, (CDBPAGE*)cdb_ht_itemval(db->dpcache, item), &off); + db->mtable[bid] = off; + free(item); + } + + db->roid = db->oid; + db->vio->cleanpoint(db->vio); + } +} + + +/* flush oldest dirty index page to disk, it runs in another thread and triggered by timer */ +static void _cdb_flushdpagetask(void *arg) +{ + CDB *db = (CDB *)arg; + CDBHTITEM *item; + CDBPAGE *page; + time_t now = time(NULL); + bool cleandcache = false; + uint32_t bid; + + if (!db->dpcache) + /* no dirty page cache */ + return; + + /* if there isn't too much dirty page and some time passed since last clean, + write out all dirty pages to make a recovery point(oid) */ + if (db->dpcache->num < 1024 && now > db->ndpltime + 120) + cleandcache = true; + + while(db->dpcache->num) { + FOFF off; + cdb_lock_lock(db->dpclock); + item = cdb_ht_gettail(db->dpcache); + /* no item in dpcache after lock */ + if (item == NULL) { + cdb_lock_unlock(db->dpclock); + return; + } + page = (CDBPAGE *)cdb_ht_itemval(db->dpcache, item); + /* bid = page->bid; also OK */ + bid = *(uint32_t*)cdb_ht_itemkey(db->dpcache, item); + /* been dirty for too long? 
*/ + if (now > page->mtime + DPAGETIMEOUT || cleandcache) { + if (cdb_lock_trylock(db->mlock[page->bid % MLOCKNUM])) { + /* avoid dead lock, since dpclock is holding */ + cdb_lock_unlock(db->dpclock); + return; + } + /* remove it from dpcache */ + cdb_ht_poptail(db->dpcache); + cdb_lock_unlock(db->dpclock); + + /* write to disk */ + struct timespec ts; + _cdb_timerreset(&ts); + db->vio->wpage(db->vio, page, &off); + db->wcount++; + db->wtime += _cdb_timermicrosec(&ts); + db->mtable[bid] = off; + + /* move the clean page into pcache */ + cdb_lock_lock(db->pclock); + cdb_ht_insert(db->pcache, item); + cdb_lock_unlock(db->pclock); + cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); + } else { + /* tail in dpcache isn't expired */ + cdb_lock_unlock(db->dpclock); + return; + } + } + + if (db->dpcache->num == 0 && cleandcache) + db->ndpltime = now; + + if (cleandcache) { + /* clean succeed if goes here, remember the recovery point */ + /* it's not necessary to lock */ + db->roid = db->oid; + db->vio->cleanpoint(db->vio); + } +} + + +/* fill the index page cache, and set the bloomfilter if necessary */ +static void _cdb_pagewarmup(CDB *db, bool loadbf) +{ + char sbuf[SBUFSIZE]; + void *it = db->vio->pageitfirst(db->vio, 0); + + if (it == NULL) + return; + + for(;;) { + CDBPAGE *page = (CDBPAGE *)sbuf; + if (db->vio->pageitnext(db->vio, &page, it) < 0) + break; + + /* the page is the newest one because its offset matches the one in main table */ + if (OFFEQ(page->ooff, db->mtable[page->bid])) { + if (loadbf) { + /* iterate key hashes in page, set to the filter */ + cdb_lock_lock(db->bflock); + for(uint32_t i = 0; i < page->num; i++) { + uint64_t hash = (page->bid << 24) | (page->items[i].hash.i2 << 8) + | (page->items[i].hash.i1); + /* bloom filter use the combined record hash as key */ + cdb_bf_set(db->bf, &hash, SI8); + } + cdb_lock_unlock(db->bflock); + } + + /* set the page to pcache if it doesn't exceed the limit size */ + if (db->pcache && db->pcache->size < db->pclimit) { 
+ cdb_lock_lock(db->pclock); + cdb_ht_insert2(db->pcache, &page->bid, SI4, page, MPAGESIZE(page)); + cdb_lock_unlock(db->pclock); + } + } + /* the page may not be still in stack */ + if (page != (CDBPAGE *)sbuf) + free(page); + + if (!loadbf && (db->pcache && db->pcache->size > db->pclimit)) + break; + } + + db->vio->pageitdestroy(db->vio, it); +} + + +/* generate an incremental global operation id */ +uint64_t cdb_genoid(CDB *db) +{ + uint64_t oid; + cdb_lock_lock(db->oidlock); + oid = db->oid++; + cdb_lock_unlock(db->oidlock); + return oid; +} + + +/* get a new record iterator */ +void *cdb_iterate_new(CDB *db, uint64_t oid) +{ + return db->vio->recitfirst(db->vio, oid); +} + + + +/* iterate the database by callback */ +uint64_t cdb_iterate(CDB *db, CDB_ITERCALLBACK itcb, void *arg, void *iter) +{ + char sbuf[SBUFSIZE]; + uint64_t cnt = 0; + + if (iter == NULL) + return cnt; + for(;;) { + /* the rec is a copy from file, may in stack or allocated in heap */ + CDBREC *rec = (CDBREC *)sbuf; + bool ret = true; + if (db->vio->recitnext(db->vio, &rec, iter) < 0) + break; + + if (cdb_checkoff(db, CDBHASH64(rec->key, rec->ksize), rec->ooff, CDB_NOTLOCKED)) { + ret = itcb(arg, rec->key, rec->ksize, rec->val, rec->vsize, rec->expire, rec->oid); + cnt++; + } + if (rec != (CDBREC *)sbuf) + free(rec); + if (!ret) + break; + } + return cnt; +} + + + +/* destroy the iterator */ +void cdb_iterate_destroy(CDB *db, void *iter) +{ + db->vio->recitdestroy(db->vio, iter); +} + + +/* difficult to implement */ +/* +static void _cdb_rcachewarmup(CDB *db) +{ +} +*/ + + +CDB *cdb_new() +{ + CDB *db; + db = (CDB *)malloc(sizeof(CDB)); + /* I assume all operation in this layer is 'fast', so no mutex used here */ + for(int i = 0; i < MLOCKNUM; i++) + db->mlock[i] = cdb_lock_new(CDB_LOCKSPIN); + db->dpclock = cdb_lock_new(CDB_LOCKSPIN); + db->pclock = cdb_lock_new(CDB_LOCKSPIN); + db->rclock = cdb_lock_new(CDB_LOCKSPIN); + db->stlock = cdb_lock_new(CDB_LOCKSPIN); + db->oidlock = 
cdb_lock_new(CDB_LOCKSPIN); + db->bflock = cdb_lock_new(CDB_LOCKSPIN); + db->bgtask = cdb_bgtask_new(); + /* every thread should has its own errno */ + db->errkey = (pthread_key_t *)malloc(sizeof(pthread_key_t)); + pthread_key_create(db->errkey, NULL); + /* set default parameter */ + _cdb_defparam(db); + return db; +} + + +int cdb_option(CDB *db, int bnum, int rcacheMB, int pcacheMB) +{ + /* too small bnum is not allowed */ + db->hsize = bnum > 4096? bnum : 4096; + + if (rcacheMB >= 0) + db->rclimit = (uint64_t)rcacheMB * MB; + if (pcacheMB >= 0) + db->pclimit = (uint64_t)pcacheMB * MB; + return 0; +} + + +void cdb_option_bloomfilter(CDB *db, uint64_t size) +{ + db->bfsize = size; +} + +void cdb_option_areadsize(CDB *db, uint32_t size) +{ + db->areadsize = size; + if (db->areadsize < 1 * KB) + db->areadsize = 1 * KB; + + if (db->areadsize > SBUFSIZE - (sizeof(CDBREC) - RECHSIZE)) + db->areadsize = SBUFSIZE - (sizeof(CDBREC) - RECHSIZE); +} + +int cdb_open(CDB *db, const char *file_name, int mode) +{ + /* if will become into a hash table when file_name == CDB_MEMDB */ + int memdb = (strcmp(file_name, CDB_MEMDB) == 0); + + if (db->rclimit) + /* record cache is enabled */ + db->rcache = cdb_ht_new(true, NULL); + else if (memdb) { + /* record cache is disabled, but in MEMDB mode */ + cdb_seterrno(db, CDB_MEMDBNOCACHE, __FILE__, __LINE__); + goto ERRRET; + } + + if (db->pclimit && !memdb) { + /* page cache enabled. 
page cache is meaningless under MEMDB mode */ + db->dpcache = cdb_ht_new(true, _pagehash); + db->pcache = cdb_ht_new(true, _pagehash); + } + + + if (!memdb) { + if (db->bfsize) { + /* bloom filter enabled */ + db->bf = cdb_bf_new(db->bfsize, db->bfsize); + } + /* now only one storage format is supported */ + db->vio = cdb_vio_new(CDBVIOAPND2); + db->vio->db = db; + if (db->vio->open(db->vio, file_name, mode) < 0) + goto ERRRET; + if (db->vio->rhead(db->vio) < 0) { + db->mtable = (FOFF*)malloc(sizeof(FOFF) * db->hsize); + memset(db->mtable, 0, sizeof(FOFF) * db->hsize); + } + /* dirty index page would be swap to disk by timer control */ + cdb_bgtask_add(db->bgtask, _cdb_flushdpagetask, db, 1); + db->ndpltime = time(NULL); + /* start background task thread */ + cdb_bgtask_start(db->bgtask); + } else { + /* no persistent storage under MEMDB mode */ + db->vio = NULL; + db->bgtask = NULL; + db->mtable = NULL; + } + + if (db->bf || ((mode & CDB_PAGEWARMUP) && db->pcache)) { + /* fill the bloom filter if it is enabled, and fill the page cache */ + _cdb_pagewarmup(db, !!db->bf); + } + + /* reset the statistic info */ + cdb_stat(db, NULL); + db->opened = true; + return 0; + +ERRRET: + if (db->rcache) + cdb_ht_destroy(db->rcache); + if (db->pcache) + cdb_ht_destroy(db->pcache); + if (db->dpcache) + cdb_ht_destroy(db->dpcache); + if (db->bf) + cdb_bf_destroy(db->bf); + cdb_bgtask_stop(db->bgtask); + _cdb_defparam(db); + return -1; +} + + +/* check if the page cache size exceed the limit. 
clean oldest page if necessary */ +static void _cdb_pageout(CDB *db) +{ + while (PCOVERFLOW(db)) { + if (db->pcache->num) { + /* clean page cache is prior */ + cdb_lock_lock(db->pclock); + cdb_ht_removetail(db->pcache); + cdb_lock_unlock(db->pclock); + } else if (db->dpcache->num) { + CDBHTITEM *item; + uint32_t bid; + FOFF off; + cdb_lock_lock(db->dpclock); + item = cdb_ht_gettail(db->dpcache); + if (item == NULL) { + cdb_lock_unlock(db->dpclock); + break; + } + + bid = *(uint32_t*)cdb_ht_itemkey(db->dpcache, item); + /* must lock the main table inside the dpclock protection */ + if (cdb_lock_trylock(db->mlock[bid % MLOCKNUM]) < 0) { + /* avoid dead lock since dpclock is holding */ + cdb_lock_unlock(db->dpclock); + /* do nothing this time */ + break; + } + cdb_ht_poptail(db->dpcache); + cdb_lock_unlock(db->dpclock); + + /* write out dirty page */ + struct timespec ts; + _cdb_timerreset(&ts); + db->vio->wpage(db->vio, (CDBPAGE*)cdb_ht_itemval(db->dpcache, item), &off); + db->wcount++; + db->wtime += _cdb_timermicrosec(&ts); + db->mtable[bid] = off; + cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); + free(item); + } + } +} + + +/* check if the record cache size exceed the limit. clean oldest record if necessary */ +static void _cdb_recout(CDB *db) +{ + while (RCOVERFLOW(db)) { + cdb_lock_lock(db->rclock); + if (db->rcache->num) + cdb_ht_removetail(db->rcache); + cdb_lock_unlock(db->rclock); + } +} + + +/* get all offsets from index(page) by key, even if only one of them at most is valid. + Others are due to the hash collision */ +int cdb_getoff(CDB *db, uint64_t hash, FOFF **offs, int locked) +{ + char sbuf[SBUFSIZE]; + CDBPAGE *page = NULL; + int rnum; + bool incache = true; + uint32_t bid = (hash >> 24) % db->hsize; + PHASH phash; + + phash.i1 = hash & 0xff; + phash.i2 = (hash >> 8) & 0xffff; + + if (db->bf) { + uint64_t bfkey = (bid << 24) | (hash & 0xffffff); + /* check the key-hash in bloom filter? 
return now if not exist */ + cdb_lock_lock(db->bflock); + if (!cdb_bf_exist(db->bf, &bfkey, SI8)) { + cdb_lock_unlock(db->bflock); + return 0; + } + cdb_lock_unlock(db->bflock); + } + + if (locked == CDB_NOTLOCKED) cdb_lock_lock(db->mlock[bid % MLOCKNUM]); + /* page exists in clean page cache? */ + if (db->pcache) { + cdb_lock_lock(db->pclock); + page = cdb_ht_get2(db->pcache, &bid, SI4, true); + cdb_lock_unlock(db->pclock); + } + + /* not in pcache, exists in dirty page cache? */ + if (page == NULL && db->dpcache) { + cdb_lock_lock(db->dpclock); + page = cdb_ht_get2(db->dpcache, &bid, SI4, true); + cdb_lock_unlock(db->dpclock); + } + + if (page == NULL) { + /* not in dpcache either, read from disk */ + incache = false; + db->pcmiss++; + /* page stays in stack by default */ + page = (CDBPAGE *)sbuf; + if (OFFNOTNULL(db->mtable[bid])) { + /* page offset not null in main table */ + int ret; + struct timespec ts; + _cdb_timerreset(&ts); + ret = db->vio->rpage(db->vio, &page, db->mtable[bid]); + db->rcount++; + db->rtime += _cdb_timermicrosec(&ts); + + /* read page error, return */ + if (ret < 0) { + if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); + if (page != (CDBPAGE *)sbuf) + free(page); + return -1; + } + } else { + /* no page in this bucket */ + page->cap = page->num = 0; + page->osize = 0; + OFFZERO(page->ooff); + } + } else { + db->pchit++; + } + + rnum = 0; + for(uint32_t i = 0; i < page->num; i++) { + /* compare every hash in the page */ + if (PHASHEQ(page->items[i].hash, phash)) { + (*offs)[rnum] = page->items[i].off; + /* result offset list stays in stack by default. 
Allocate one in heap if + it exceeds the limit */ + if (++rnum == SFOFFNUM) { + /* very little possibility goes here */ + FOFF *tmp = (FOFF*)malloc((page->num - i + SFOFFNUM + 1) * sizeof(FOFF)); + memcpy(tmp, *offs, SFOFFNUM * sizeof(FOFF)); + *offs = tmp; + } + } + } + + if (!incache) { + /* set into clean page cache if not exists before */ + if (db->pcache) { + cdb_lock_lock(db->pclock); + cdb_ht_insert2(db->pcache, &bid, SI4, page, MPAGESIZE(page)); + cdb_lock_unlock(db->pclock); + } + /* if page now points to heap memory, free it */ + if (page != (CDBPAGE *)sbuf) { + free(page); + } + } + if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); + + /* check page cache overflow */ + if (PCOVERFLOW(db)) + _cdb_pageout(db); + + return rnum; +} + + +/* replace a specified record's offset, may be used at disk space recycling + off indicates its previous offset, noff is the new offset. return negative if not found */ +int cdb_replaceoff(CDB *db, uint64_t hash, FOFF off, FOFF noff, int locked) +{ + char sbuf[SBUFSIZE]; + CDBPAGE *page = NULL; + CDBHTITEM *pitem = NULL; + bool indpcache = false; + uint32_t bid = (hash >> 24) % db->hsize; + PHASH phash; + bool found = false; + + phash.i1 = hash & 0xff; + phash.i2 = (hash >> 8) & 0xffff; + + if (locked == CDB_NOTLOCKED) cdb_lock_lock(db->mlock[bid % MLOCKNUM]); + if (db->pcache) { + /* in clean page cache, since it would be modified, it should be deleted from pcache */ + cdb_lock_lock(db->pclock); + pitem = cdb_ht_del(db->pcache, &bid, SI4); + cdb_lock_unlock(db->pclock); + if (pitem) + page = (CDBPAGE *)cdb_ht_itemval(db->pcache, pitem); + } + if (page == NULL && db->dpcache) { + /* not in pcache, but in dirty page cache */ + cdb_lock_lock(db->dpclock); + page = cdb_ht_get2(db->dpcache, &bid, SI4, true); + cdb_lock_unlock(db->dpclock); + if (page) + indpcache = true; + } + if (page == NULL) { + /* not exists either, read from disk */ + db->pcmiss++; + page = (CDBPAGE *)sbuf; + if 
(OFFNOTNULL(db->mtable[bid])) { + int ret; + struct timespec ts; + _cdb_timerreset(&ts); + ret = db->vio->rpage(db->vio, &page, db->mtable[bid]); + db->rcount++; + db->rtime += _cdb_timermicrosec(&ts); + + if (ret < 0) { + if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); + if (page != (CDBPAGE *)sbuf) + free(page); + return -1; + } + } else { + /* nullified the empty page */ + page->cap = page->num = 0; + page->osize = 0; + OFFZERO(page->ooff); + } + } else { + db->pchit++; + } + + /* check and modify */ + for(uint32_t i = 0; i < page->num; i++) { + if (PHASHEQ(page->items[i].hash, phash) + && OFFEQ(page->items[i].off, off)) { + page->items[i].off = noff; + found = true; + break; + } + } + + if (db->dpcache && !indpcache) { + /* if page already dirty in cache, need not do anything */ + /* dirty page cache is enabled but not exists before */ + if (pitem) { + /* pitem not NULL indicates it belongs to pcache */ + if (found) { + /* modified page */ + cdb_lock_lock(db->dpclock); + cdb_ht_insert(db->dpcache, pitem); + cdb_lock_unlock(db->dpclock); + } else { + /* got from pcache, but not modified */ + cdb_lock_lock(db->pclock); + cdb_ht_insert(db->pcache, pitem); + cdb_lock_unlock(db->pclock); + } + /* page belongs to memory in 'cache', must not free */ + } else if (page != NULL) { + /* page read from disk, but not in cache */ + cdb_lock_lock(db->dpclock); + cdb_ht_insert2(db->dpcache, &bid, SI4, page, MPAGESIZE(page)); + cdb_lock_unlock(db->dpclock); + /* the 'page' won't be use anymore */ + if (page != (CDBPAGE *)sbuf) + free(page); + } + } else if (!db->dpcache){ + /* no page cache. 
Write out dirty page immediately */ + FOFF poff; + struct timespec ts; + _cdb_timerreset(&ts); + db->vio->wpage(db->vio, page, &poff); + db->wcount++; + db->wtime += _cdb_timermicrosec(&ts); + + db->mtable[bid] = poff; + if (page != (CDBPAGE *)sbuf) + free(page); + } + if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); + + /* check page cache overflow */ + if (PCOVERFLOW(db)) + _cdb_pageout(db); + + return 0; +} + + +/* insert/delete a key-offset pair from index page */ +int cdb_updatepage(CDB *db, uint64_t hash, FOFF off, int opt, int locked) +{ + char sbuf[SBUFSIZE], sbuf2[SBUFSIZE]; + CDBPAGE *page = NULL, *npage = NULL; + CDBHTITEM *pitem = NULL, *nitem = NULL; + CDBHASHTABLE *tmpcache = NULL; + CDBLOCK *tmpclock = NULL; + int npsize = 0; + uint32_t bid = (hash >> 24) % db->hsize; + PHASH phash; + + phash.i1 = hash & 0xff; + phash.i2 = (hash >> 8) & 0xffff; + + if (locked == CDB_NOTLOCKED) cdb_lock_lock(db->mlock[bid % MLOCKNUM]); + /* firstly, try move the page out of the cache if possible, + it assumes that the page would be modified(pair exists) */ + if (db->pcache) { + /* try clean page cache */ + cdb_lock_lock(db->pclock); + pitem = cdb_ht_del(db->pcache, &bid, SI4); + cdb_lock_unlock(db->pclock); + if (pitem) { + page = (CDBPAGE *)cdb_ht_itemval(db->pcache, pitem); + tmpcache = db->pcache; + tmpclock = db->pclock; + } + } + if (page == NULL && db->dpcache) { + /* try dirty page cache */ + cdb_lock_lock(db->dpclock); + pitem = cdb_ht_del(db->dpcache, &bid, SI4); + cdb_lock_unlock(db->dpclock); + if (pitem) { + page = (CDBPAGE *)cdb_ht_itemval(db->dpcache, pitem); + tmpcache = db->dpcache; + tmpclock = db->dpclock; + } + } + + if (page == NULL) { + db->pcmiss++; + page = (CDBPAGE *)sbuf; + /* doesn't exist in cache, read from disk */ + if (OFFNOTNULL(db->mtable[bid])) { + int ret; + struct timespec ts; + _cdb_timerreset(&ts); + ret = db->vio->rpage(db->vio, &page, db->mtable[bid]); + db->rcount++; + db->rtime += 
_cdb_timermicrosec(&ts); + + if (ret < 0) { + if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); + if (page != (CDBPAGE *)sbuf) + free(page); + return -1; + } + } else { + page->cap = 0; + page->num = 0; + page->osize = 0; + OFFZERO(page->ooff); + } + } else { + db->pchit++; + } + + npsize = MPAGESIZE(page); + + if (opt == CDB_PAGEDELETEOFF) + ;// npsize = MPAGESIZE(page) - sizeof(PITEM); + /* do not malloc new page on deletion */ + + else if (opt == CDB_PAGEINSERTOFF && page->cap == page->num) { + /* get a new page, from dirty page cache if possible */ + npsize = MPAGESIZE(page) + CDB_PAGEINCR * sizeof(PITEM); + if (db->dpcache) { + nitem = cdb_ht_newitem(db->dpcache, SI4, npsize); + *(uint32_t*)cdb_ht_itemkey(db->dpcache, nitem) = bid; + npage = (CDBPAGE *)cdb_ht_itemval(db->dpcache, nitem); + } else { + /* no dpcache, use stack if size fits */ + if (npsize > SBUFSIZE) + npage = (CDBPAGE *)malloc(npsize); + else + npage = (CDBPAGE *)sbuf2; + } + + /* initialize the new page */ + + npage->bid = bid; + npage->oid = cdb_genoid(db); + npage->osize = page->osize; + npage->ooff = page->ooff; + npage->mtime = time(NULL); + npage->cap = page->cap + CDB_PAGEINCR; + npage->num = page->num; + memcpy(npage->items, page->items, page->num * sizeof(PITEM)); + /* old page got from cache */ + if (pitem) + free(pitem); + /* old page read from disk, if in stack? 
*/ + else if (page != (CDBPAGE *)sbuf) + free(page); + + page = npage; + pitem = nitem; + } + + uint32_t onum = page->num; + + if (opt == CDB_PAGEDELETEOFF) { + bool found = false; + for(uint32_t i = 0; i < page->num; i++) { + if (!found) { + if (PHASHEQ(page->items[i].hash, phash) + && OFFEQ(page->items[i].off, off)) + { + found = true; + /* records num is consistant with index */ + cdb_lock_lock(db->stlock); + db->rnum--; + cdb_lock_unlock(db->stlock); + } + } + if (found && i + 1 < page->num) + page->items[i] = page->items[i+1]; + } + if (found) + page->num--; + } else if (opt == CDB_PAGEINSERTOFF) { + bool found = false; + /* check already exist? */ + for(uint32_t i = 0; i < page->num; i++) { + if (PHASHEQ(page->items[i].hash, phash) + && OFFEQ(page->items[i].off, off)) { + /* avoid exceptional deduplicated item */ + found = true; + break; + } + } + + /* append to the tail */ + if (!found) { + page->items[page->num].hash = phash; + page->items[page->num].off = off; + page->num++; + /* records num is consistant with index */ + cdb_lock_lock(db->stlock); + db->rnum++; + cdb_lock_unlock(db->stlock); + if (db->bf) { + uint64_t bfkey = (((hash >> 24) % db->hsize) << 24) | (hash & 0xffffff); + cdb_lock_lock(db->bflock); + cdb_bf_set(db->bf, &bfkey, SI8); + cdb_lock_unlock(db->bflock); + } + } + } + + if (page->num == onum) { + /* nothing done */ + if (pitem) { + /* insert the item back to the cache where it belongs */ + cdb_lock_lock(tmpclock); + cdb_ht_insert(tmpcache, pitem); + cdb_lock_unlock(tmpclock); + } else { + if (page != (CDBPAGE *)sbuf2 + && page != (CDBPAGE *)sbuf) + free(page); + } + if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); + return -1; + } else { + if (pitem) { + cdb_lock_lock(db->dpclock); + cdb_ht_insert(db->dpcache, pitem); + cdb_lock_unlock(db->dpclock); + } else { + struct timespec ts; + _cdb_timerreset(&ts); + db->vio->wpage(db->vio, page, &off); + db->wcount++; + db->wtime += _cdb_timermicrosec(&ts); + + 
db->mtable[bid] = off; + if (page != (CDBPAGE *)sbuf2 + && page != (CDBPAGE *)sbuf) + free(page); + } + } + + if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); + + /* check page cache overflow */ + if (PCOVERFLOW(db)) + _cdb_pageout(db); + + return 0; +} + + +/* check if an record with specified key-offset exists in index */ +bool cdb_checkoff(CDB *db, uint64_t hash, FOFF off, int locked) +{ + FOFF soffs[SFOFFNUM]; + FOFF *soff = (FOFF *)soffs; + int dupnum; + int ret = false; + + /* get all possible offsets */ + dupnum = cdb_getoff(db, hash, &soff, locked); + for(int i = 0; i < dupnum; i++) { + if (OFFEQ(soff[i], off)) { + ret = true; + break; + } + } + + if (soff != (FOFF *)soffs) { + free(soff); + } + + return ret; +} + + +/* wrapper and simplified of set operation */ +int cdb_set(CDB *db, const char *key, int ksize, const char *val, int vsize) +{ + return cdb_set2(db, key, ksize, val, vsize, CDB_OVERWRITE, 0); +} + + +int cdb_set2(CDB *db, const char *key, int ksize, const char *val, int vsize, int opt, int expire) +{ + CDBREC rec; + FOFF ooff, noff; + uint32_t now = time(NULL); + uint64_t hash; + uint32_t lockid; + bool expired = false; + + if (db->vio == NULL) { + /* if it is a memdb, just operate on the record cache and return */ + cdb_lock_lock(db->rclock); + cdb_ht_insert2(db->rcache, key, ksize, val, vsize); + cdb_lock_unlock(db->rclock); + if (RCOVERFLOW(db)) + _cdb_recout(db); + return 0; + } + + hash = CDBHASH64(key, ksize); + lockid = (hash >> 24) % db->hsize % MLOCKNUM; + OFFZERO(rec.ooff); + OFFZERO(ooff); + rec.osize = 0; + rec.key = (char*)key; + rec.val = (char*)val; + rec.ksize = ksize; + rec.vsize = vsize; + rec.oid = cdb_genoid(db); + rec.expire = expire? 
now + expire : 0; + + cdb_lock_lock(db->mlock[lockid]); + if (db->rcache) { + /* if record already exists, get its old meta info */ + int item_vsize; + char *cval; + uint32_t old_expire = 0; + cdb_lock_lock(db->rclock); + cval = cdb_ht_get(db->rcache, key, ksize, &item_vsize, false); + if (cval) { + /* record already exists */ + ooff = rec.ooff = *(FOFF*)cval; + rec.osize = item_vsize - SFOFF - SI4; + old_expire = *(uint32_t*)(cval + SFOFF); + } + cdb_lock_unlock(db->rclock); + if (old_expire && old_expire <= now) + /* once exist but expired? */ + expired = true; + } + + if (OFFNULL(ooff)) { + FOFF soffs[SFOFFNUM]; + FOFF *soff = soffs; + char sbuf[SBUFSIZE]; + CDBREC *rrec = (CDBREC*)sbuf; + + int retnum; + if ((retnum = cdb_getoff(db, hash, &soff, CDB_LOCKED)) < 0) { + cdb_lock_unlock(db->mlock[lockid]); + return -1; + } + + for(int i = 0; i < retnum; i++) { + /* check for duplicate records/older version*/ + int cret; + if (rrec != (CDBREC*)sbuf) { + free(rrec); + rrec = (CDBREC*)sbuf; + } + + struct timespec ts; + _cdb_timerreset(&ts); + cret = db->vio->rrec(db->vio, &rrec, soff[i], false); + db->rcount++; + db->rtime += _cdb_timermicrosec(&ts); + + if (cret < 0) + continue; + + if (ksize == rrec->ksize && memcmp(rrec->key, key, ksize) == 0) { + /* got its old meta info */ + rec.osize = rrec->osize; + rec.ooff = rrec->ooff; + ooff = rec.ooff; + if (rrec->expire <= now) + expired = true; + break; + } + } + if (soff != soffs) + free(soff); + if (rrec != (CDBREC*)sbuf) + free(rrec); + } + + if (OFFNOTNULL(ooff) && !expired) { + /* record already exists*/ + if (opt & CDB_INSERTIFNOEXIST) { + cdb_lock_unlock(db->mlock[lockid]); + cdb_seterrno(db, CDB_EXIST, __FILE__, __LINE__); + return -2; + } + } else { + if (opt & CDB_INSERTIFEXIST) { + cdb_lock_unlock(db->mlock[lockid]); + cdb_seterrno(db, CDB_NOTFOUND, __FILE__, __LINE__); + return -3; + } + } + + struct timespec ts; + _cdb_timerreset(&ts); + if (db->vio->wrec(db->vio, &rec, &noff) < 0) { + 
cdb_lock_unlock(db->mlock[lockid]); + return -1; + } + db->wcount++; + db->wtime += _cdb_timermicrosec(&ts); + + if (OFFNOTNULL(ooff)) { + cdb_replaceoff(db, hash, ooff, noff, CDB_LOCKED); + } else { + cdb_updatepage(db, hash, noff, CDB_PAGEINSERTOFF, CDB_LOCKED); + } + + if (db->rcache) { + if ((opt & CDB_INSERTCACHE) == CDB_INSERTCACHE) { + char *cval; + CDBHTITEM *item = cdb_ht_newitem(db->rcache, ksize, vsize + SI4 + SFOFF); + memcpy(cdb_ht_itemkey(db->rcache, item), key, ksize); + cval = cdb_ht_itemval(db->rcache, item); + memcpy(cval + SI4 + SFOFF, val, vsize); + *(FOFF*)(cval) = rec.ooff; + *(uint32_t*)(cval + SFOFF) = rec.expire; + cdb_lock_lock(db->rclock); + cdb_ht_insert(db->rcache, item); + cdb_lock_unlock(db->rclock); + } + } + cdb_lock_unlock(db->mlock[lockid]); + + if (RCOVERFLOW(db)) + _cdb_recout(db); + + cdb_seterrno(db, CDB_SUCCESS, __FILE__, __LINE__); + return 0; +} + + + +int cdb_get(CDB *db, const char *key, int ksize, void **val, int *vsize) +{ + char sbuf[SBUFSIZE]; + CDBREC *rec = (CDBREC *)sbuf; + FOFF soffs[SFOFFNUM]; + FOFF *offs; + int dupnum, ret = -3; + uint64_t hash; + uint32_t now = time(NULL); + uint32_t lockid; + + *vsize = 0; + *val = NULL; + if (db->rcache) { + char *cval; + cdb_lock_lock(db->rclock); + cval = cdb_ht_get(db->rcache, key, ksize, vsize, true); + if (cval) { + db->rchit++; + if (db->vio) { + (*vsize) -= SI4 + SFOFF; + if (*(uint32_t*)(cval + SFOFF) + && *(uint32_t*)(cval + SFOFF) <= now) { + cdb_lock_unlock(db->rclock); + /* not found no not report error now */ + //cdb_seterrno(db, CDB_NOTFOUND, __FILE__, __LINE__); + return -3; + } + cval = (void*)(cval + SI4 + SFOFF); + } + *val = malloc(*vsize); + memcpy(*val, cval, *vsize); + cdb_lock_unlock(db->rclock); + return 0; + } else { + db->rcmiss++; + if (db->vio == NULL) { + cdb_lock_unlock(db->rclock); + return -3; + } + } + cdb_lock_unlock(db->rclock); + } + + offs = soffs; + hash = CDBHASH64(key, ksize); + lockid = (hash >> 24) % db->hsize % MLOCKNUM; + 
cdb_lock_lock(db->mlock[lockid]); + dupnum = cdb_getoff(db, hash, &offs, CDB_LOCKED); + if (dupnum < 0) { + cdb_lock_unlock(db->mlock[lockid]); + return -1; + } + + for(int i = 0; i < dupnum; i++) { + int cret; + if (rec != (CDBREC*)sbuf) { + free(rec); + rec = (CDBREC*)sbuf; + } + + struct timespec ts; + _cdb_timerreset(&ts); + cret = db->vio->rrec(db->vio, &rec, offs[i], true); + db->rcount++; + db->rtime += _cdb_timermicrosec(&ts); + + if (cret < 0) + continue; + + if (ksize == rec->ksize && memcmp(rec->key, key, ksize) == 0) { + if (rec->expire && rec->expire <= now) { + break; + } + *vsize = rec->vsize; + *val = malloc(*vsize); + memcpy(*val, rec->val, *vsize); + ret = 0; + break; + } + } + + if (ret == 0 && db->rcache) { + char *cval; + CDBHTITEM *item = cdb_ht_newitem(db->rcache, ksize, *vsize + SI4 + SFOFF); + memcpy(cdb_ht_itemkey(db->rcache, item), key, ksize); + cval = cdb_ht_itemval(db->rcache, item); + memcpy(cval + SI4 + SFOFF, *val, *vsize); + *(FOFF*)(cval) = rec->ooff; + *(uint32_t*)(cval + SFOFF) = rec->expire; + cdb_lock_lock(db->rclock); + cdb_ht_insert(db->rcache, item); + cdb_lock_unlock(db->rclock); + } + cdb_lock_unlock(db->mlock[lockid]); + + if (RCOVERFLOW(db)) + _cdb_recout(db); + + if (offs != soffs) + free(offs); + + if (rec != (CDBREC*)sbuf) + free(rec); + + if (ret < 0) + cdb_seterrno(db, CDB_NOTFOUND, __FILE__, __LINE__); + else { + db->rcmiss++; + cdb_seterrno(db, CDB_SUCCESS, __FILE__, __LINE__); + } + return ret; +} + + +void cdb_free_val(void **val) +{ + if (*val) + free(*val); + *val = NULL; +} + + +int cdb_del(CDB *db, const char *key, int ksize) +{ + FOFF ooff; + CDBREC rec; + uint32_t lockid; + uint64_t hash; + + OFFZERO(rec.ooff); + OFFZERO(ooff); + rec.osize = 0; + rec.key = (char*)key; + rec.ksize = ksize; + rec.val = NULL; + rec.vsize = 0; + + if (db->vio == NULL) { + /* if it is a memdb, just operate on the record cache and return */ + cdb_lock_lock(db->rclock); + cdb_ht_del2(db->rcache, key, ksize); + 
cdb_lock_unlock(db->rclock); + if (RCOVERFLOW(db)) + _cdb_recout(db); + return 0; + } + + hash = CDBHASH64(key, ksize); + lockid = (hash >> 24) % db->hsize % MLOCKNUM; + cdb_lock_lock(db->mlock[lockid]); + if (db->rcache) { + /* if record already exists, get its old meta info */ + CDBHTITEM *item; + cdb_lock_lock(db->rclock); + item = cdb_ht_del(db->rcache, key, ksize); + cdb_lock_unlock(db->rclock); + if (item) { + char *cval = cdb_ht_itemval(db->rcache, item); + ooff = rec.ooff = *(FOFF*)cval; + rec.osize = item->vsize - SFOFF - SI4; + rec.expire = *(uint32_t*)(cval + SFOFF); + free(item); + } + } + + if (OFFNULL(ooff)) { + FOFF soffs[SFOFFNUM]; + FOFF *soff = soffs; + char sbuf[SBUFSIZE]; + CDBREC *rrec = (CDBREC*)sbuf; + + int retnum; + if ((retnum = cdb_getoff(db, hash, &soff, CDB_LOCKED)) < 0) { + cdb_lock_unlock(db->mlock[lockid]); + return -1; + } + + for(int i = 0; i < retnum; i++) { + /* check for duplicate records/older version*/ + int cret; + if (rrec != (CDBREC*)sbuf) { + free(rrec); + rrec = (CDBREC*)sbuf; + } + + struct timespec ts; + _cdb_timerreset(&ts); + cret = db->vio->rrec(db->vio, &rrec, soff[i], false); + db->rcount++; + db->rtime += _cdb_timermicrosec(&ts); + + if (cret < 0) + continue; + + if (ksize == rrec->ksize && memcmp(rrec->key, key, ksize) == 0) { + /* got its old meta info */ + rec.osize = rrec->osize; + rec.ooff = rrec->ooff; + ooff = rec.ooff; + break; + } + } + if (soff != soffs) + free(soff); + if (rrec != (CDBREC*)sbuf) + free(rrec); + } + + if (OFFNOTNULL(ooff)) { + cdb_updatepage(db, hash, ooff, CDB_PAGEDELETEOFF, CDB_LOCKED); + cdb_lock_unlock(db->mlock[lockid]); + + struct timespec ts; + _cdb_timerreset(&ts); + if (db->vio->drec(db->vio, &rec, ooff) < 0) + ; // return -1; succeed or not doesn't matter + db->wcount++; + db->wtime += _cdb_timermicrosec(&ts); + cdb_seterrno(db, CDB_SUCCESS, __FILE__, __LINE__); + return 0; + } else { + cdb_lock_unlock(db->mlock[lockid]); + cdb_seterrno(db, CDB_NOTFOUND, __FILE__, __LINE__); + 
return -3; + } +} + + +void cdb_stat(CDB *db, CDBSTAT *stat) +{ + if (stat == NULL) { + db->rchit = db->rcmiss = 0; + db->pchit = db->pcmiss = 0; + db->rcount = db->rtime = 0; + db->wcount = db->wtime = 0; + } else { + stat->rnum = db->rnum; + stat->rcnum = db->rcache? db->rcache->num : 0; + stat->pnum = db->hsize; + stat->pcnum = (db->pcache? db->pcache->num : 0) + + (db->dpcache? db->dpcache->num : 0); + stat->rchit = db->rchit; + stat->rcmiss = db->rcmiss; + stat->pchit = db->pchit; + stat->pcmiss = db->pcmiss; + stat->rlatcy = db->rcount ? db->rtime / db->rcount : 0; + stat->wlatcy = db->wcount ? db->wtime / db->wcount : 0; + } +} + + +int cdb_close(CDB *db) +{ + if (!db->opened) + return -1; + + if (db->bgtask) + cdb_bgtask_stop(db->bgtask); + if (db->rcache) + cdb_ht_destroy(db->rcache); + if (db->pcache) + cdb_ht_destroy(db->pcache); + if (db->dpcache) { + cdb_flushalldpage(db); + cdb_ht_destroy(db->dpcache); + } + + if (db->vio) { + db->vio->whead(db->vio); + db->vio->close(db->vio); + cdb_vio_destroy(db->vio); + } + if (db->mtable) + free(db->mtable); + db->opened = false; + _cdb_defparam(db); + return 0; +} + + +void cdb_deferrorcb(void *arg, int errno, const char *file, int line) +{ + fprintf(stderr, "DBERR: [%s:%d] %d - %s\n", file, line, errno, cdb_errmsg(errno)); +} + + +int cdb_destroy(CDB *db) +{ + if (db->opened) + cdb_close(db); + for(int i = 0; i < MLOCKNUM; i++) + cdb_lock_destory(db->mlock[i]); + cdb_lock_destory(db->dpclock); + cdb_lock_destory(db->pclock); + cdb_lock_destory(db->rclock); + cdb_lock_destory(db->stlock); + cdb_lock_destory(db->oidlock); + cdb_lock_destory(db->bflock); + cdb_bgtask_destroy(db->bgtask); + pthread_key_delete(*(pthread_key_t*)db->errkey); + free(db->errkey); + free(db); + return 0; +} + + + diff --git a/libdap-cuttdb/src/cdb_core.h b/libdap-cuttdb/src/cdb_core.h new file mode 100644 index 0000000000000000000000000000000000000000..bcd4ad9a6bef0d43993781f3707ed0f44935cae2 --- /dev/null +++ 
b/libdap-cuttdb/src/cdb_core.h @@ -0,0 +1,122 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. + * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#ifndef _CDB_CORE_H_ +#define _CDB_CORE_H_ +#include "cuttdb.h" +#include "cdb_types.h" +#include "cdb_hashtable.h" +#include "cdb_bloomfilter.h" +#include "cdb_lock.h" +#include "cdb_vio.h" +#include "cdb_bgtask.h" +#include <stdint.h> +#include <stdbool.h> + + +enum { /* values accepted by the 'opt' argument of cdb_updatepage() */ + CDB_PAGEDELETEOFF = 0, + CDB_PAGEINSERTOFF = 1, +}; + +/* the DB object: tunables, caches, locks, storage layer and statistics of one database handle */ +struct CDB +{ + /* size limit for record cache, in bytes; 0 disables the record cache */ + uint64_t rclimit; + /* size limit for index page cache, in bytes; 0 disables the page caches */ + uint64_t pclimit; + /* size of bloom filter; 0 disables the bloom filter */ + uint64_t bfsize; + /* number of records in the db, updated under stlock by cdb_updatepage() */ + uint64_t rnum; + /* operation id, only ever incremented */ + uint64_t oid; + /* recovery point oid */ + uint64_t roid; + /* main hash table size (number of buckets), at least 4096 when set through cdb_option() */ + uint32_t hsize; + /* last timestamp of no dirty page state */ + uint32_t ndpltime; + /* whether the database is currently opened */ + bool opened; + /* the size for a disk seek&read, should not be greater than SBUFSIZE */ + uint32_t areadsize; + + /* record cache */ + CDBHASHTABLE *rcache; + /* (clean) index page cache */ + CDBHASHTABLE *pcache; + /* dirty index page cache */ + CDBHASHTABLE *dpcache; + /* Bloom Filter */ + CDBBLOOMFILTER *bf; + + /* lock for rcache */ + CDBLOCK *rclock; + /* lock for pcache */ + CDBLOCK *pclock; + /* lock for dpcache */ + CDBLOCK *dpclock; + /* locks for hash table operations, split into MLOCKNUM groups; bucket 'bid' uses mlock[bid % MLOCKNUM] */ + CDBLOCK *mlock[MLOCKNUM]; + /* lock for statistics (guards rnum) */ + CDBLOCK *stlock; + /* lock for operation id */ + CDBLOCK *oidlock; + /* lock for bloom filter */ + CDBLOCK *bflock; + /* background tasks in another thread */ + CDBBGTASK *bgtask; + + /* main hash table, contains 'hsize' elements */ + FOFF
*mtable; + /* disk i/o layer object; NULL in MEMDB mode */ + CDBVIO *vio; + + /* callback function when error occurs */ + CDB_ERRCALLBACK errcb; + /* argument for callback function */ + void *errcbarg; + /* key to get error code in current thread; points to a heap-allocated pthread_key_t created by cdb_new() */ + void *errkey; + + /* statistics below, these fields have no lock protection */ + /* record cache hit/miss */ + uint64_t rchit; + uint64_t rcmiss; + /* page cache hit/miss */ + uint64_t pchit; + uint64_t pcmiss; + /* cumulative disk read time */ + uint64_t rtime; + /* number of disk read operations */ + uint64_t rcount; + /* cumulative disk write time */ + uint64_t wtime; + /* number of disk write operations */ + uint64_t wcount; +}; + + +bool cdb_checkoff(CDB *db, uint64_t hash, FOFF off, int locked); /* true when the index holds the pair (hash, off) */ +int cdb_getoff(CDB *db, uint64_t hash, FOFF **offs, int locked); /* collects all offsets whose page hash matches; returns their count, negative on page read error */ +int cdb_replaceoff(CDB *db, uint64_t hash, FOFF off, FOFF noff, int locked); /* rewrites offset 'off' to 'noff' in the index page of 'hash' */ +int cdb_updatepage(CDB *db, uint64_t hash, FOFF off, int opt, int locked); /* inserts/deletes the pair (hash, off); 'opt' is CDB_PAGEINSERTOFF or CDB_PAGEDELETEOFF */ +void cdb_flushalldpage(CDB *db); +uint64_t cdb_genoid(CDB *db); + +#endif + diff --git a/libdap-cuttdb/src/cdb_crc64.c b/libdap-cuttdb/src/cdb_crc64.c new file mode 100644 index 0000000000000000000000000000000000000000..6c72eb73fb3b8aaf774cd0e87479fc0fe82c580b --- /dev/null +++ b/libdap-cuttdb/src/cdb_crc64.c @@ -0,0 +1,170 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license.
+ * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +/************************************************************** +* * +* File : crc64.c * +* Function to compute the CRC64 * +* * +**************************************************************/ +#include "cdb_crc64.h" + + +#define CONST64(n) (n##ULL) +/* 256-entry lookup table used by cdb_crc64(); indexed by the top byte of the running CRC XORed with the next input byte. Entry 1 equals 0x42f0e1eba9ea3693 - presumably the ECMA-182 generator polynomial, TODO confirm */ static uint64_t CRC64_Table[256] = +{ + CONST64(0x0000000000000000), CONST64(0x42f0e1eba9ea3693), + CONST64(0x85e1c3d753d46d26), CONST64(0xc711223cfa3e5bb5), + CONST64(0x493366450e42ecdf), CONST64(0x0bc387aea7a8da4c), + CONST64(0xccd2a5925d9681f9), CONST64(0x8e224479f47cb76a), + CONST64(0x9266cc8a1c85d9be), CONST64(0xd0962d61b56fef2d), + CONST64(0x17870f5d4f51b498), CONST64(0x5577eeb6e6bb820b), + CONST64(0xdb55aacf12c73561), CONST64(0x99a54b24bb2d03f2), + CONST64(0x5eb4691841135847), CONST64(0x1c4488f3e8f96ed4), + CONST64(0x663d78ff90e185ef), CONST64(0x24cd9914390bb37c), + CONST64(0xe3dcbb28c335e8c9), CONST64(0xa12c5ac36adfde5a), + CONST64(0x2f0e1eba9ea36930), CONST64(0x6dfeff5137495fa3), + CONST64(0xaaefdd6dcd770416), CONST64(0xe81f3c86649d3285), + CONST64(0xf45bb4758c645c51), CONST64(0xb6ab559e258e6ac2), + CONST64(0x71ba77a2dfb03177), CONST64(0x334a9649765a07e4), + CONST64(0xbd68d2308226b08e), CONST64(0xff9833db2bcc861d), + CONST64(0x388911e7d1f2dda8), CONST64(0x7a79f00c7818eb3b), + CONST64(0xcc7af1ff21c30bde), CONST64(0x8e8a101488293d4d), + CONST64(0x499b3228721766f8), CONST64(0x0b6bd3c3dbfd506b), + CONST64(0x854997ba2f81e701), CONST64(0xc7b97651866bd192), + CONST64(0x00a8546d7c558a27), CONST64(0x4258b586d5bfbcb4), + CONST64(0x5e1c3d753d46d260), CONST64(0x1cecdc9e94ace4f3), + CONST64(0xdbfdfea26e92bf46), CONST64(0x990d1f49c77889d5), + CONST64(0x172f5b3033043ebf), CONST64(0x55dfbadb9aee082c), + CONST64(0x92ce98e760d05399), CONST64(0xd03e790cc93a650a), + CONST64(0xaa478900b1228e31), CONST64(0xe8b768eb18c8b8a2), + CONST64(0x2fa64ad7e2f6e317), CONST64(0x6d56ab3c4b1cd584), + CONST64(0xe374ef45bf6062ee),
CONST64(0xa1840eae168a547d), + CONST64(0x66952c92ecb40fc8), CONST64(0x2465cd79455e395b), + CONST64(0x3821458aada7578f), CONST64(0x7ad1a461044d611c), + CONST64(0xbdc0865dfe733aa9), CONST64(0xff3067b657990c3a), + CONST64(0x711223cfa3e5bb50), CONST64(0x33e2c2240a0f8dc3), + CONST64(0xf4f3e018f031d676), CONST64(0xb60301f359dbe0e5), + CONST64(0xda050215ea6c212f), CONST64(0x98f5e3fe438617bc), + CONST64(0x5fe4c1c2b9b84c09), CONST64(0x1d14202910527a9a), + CONST64(0x93366450e42ecdf0), CONST64(0xd1c685bb4dc4fb63), + CONST64(0x16d7a787b7faa0d6), CONST64(0x5427466c1e109645), + CONST64(0x4863ce9ff6e9f891), CONST64(0x0a932f745f03ce02), + CONST64(0xcd820d48a53d95b7), CONST64(0x8f72eca30cd7a324), + CONST64(0x0150a8daf8ab144e), CONST64(0x43a04931514122dd), + CONST64(0x84b16b0dab7f7968), CONST64(0xc6418ae602954ffb), + CONST64(0xbc387aea7a8da4c0), CONST64(0xfec89b01d3679253), + CONST64(0x39d9b93d2959c9e6), CONST64(0x7b2958d680b3ff75), + CONST64(0xf50b1caf74cf481f), CONST64(0xb7fbfd44dd257e8c), + CONST64(0x70eadf78271b2539), CONST64(0x321a3e938ef113aa), + CONST64(0x2e5eb66066087d7e), CONST64(0x6cae578bcfe24bed), + CONST64(0xabbf75b735dc1058), CONST64(0xe94f945c9c3626cb), + CONST64(0x676dd025684a91a1), CONST64(0x259d31cec1a0a732), + CONST64(0xe28c13f23b9efc87), CONST64(0xa07cf2199274ca14), + CONST64(0x167ff3eacbaf2af1), CONST64(0x548f120162451c62), + CONST64(0x939e303d987b47d7), CONST64(0xd16ed1d631917144), + CONST64(0x5f4c95afc5edc62e), CONST64(0x1dbc74446c07f0bd), + CONST64(0xdaad56789639ab08), CONST64(0x985db7933fd39d9b), + CONST64(0x84193f60d72af34f), CONST64(0xc6e9de8b7ec0c5dc), + CONST64(0x01f8fcb784fe9e69), CONST64(0x43081d5c2d14a8fa), + CONST64(0xcd2a5925d9681f90), CONST64(0x8fdab8ce70822903), + CONST64(0x48cb9af28abc72b6), CONST64(0x0a3b7b1923564425), + CONST64(0x70428b155b4eaf1e), CONST64(0x32b26afef2a4998d), + CONST64(0xf5a348c2089ac238), CONST64(0xb753a929a170f4ab), + CONST64(0x3971ed50550c43c1), CONST64(0x7b810cbbfce67552), + CONST64(0xbc902e8706d82ee7), 
CONST64(0xfe60cf6caf321874), + CONST64(0xe224479f47cb76a0), CONST64(0xa0d4a674ee214033), + CONST64(0x67c58448141f1b86), CONST64(0x253565a3bdf52d15), + CONST64(0xab1721da49899a7f), CONST64(0xe9e7c031e063acec), + CONST64(0x2ef6e20d1a5df759), CONST64(0x6c0603e6b3b7c1ca), + CONST64(0xf6fae5c07d3274cd), CONST64(0xb40a042bd4d8425e), + CONST64(0x731b26172ee619eb), CONST64(0x31ebc7fc870c2f78), + CONST64(0xbfc9838573709812), CONST64(0xfd39626eda9aae81), + CONST64(0x3a28405220a4f534), CONST64(0x78d8a1b9894ec3a7), + CONST64(0x649c294a61b7ad73), CONST64(0x266cc8a1c85d9be0), + CONST64(0xe17dea9d3263c055), CONST64(0xa38d0b769b89f6c6), + CONST64(0x2daf4f0f6ff541ac), CONST64(0x6f5faee4c61f773f), + CONST64(0xa84e8cd83c212c8a), CONST64(0xeabe6d3395cb1a19), + CONST64(0x90c79d3fedd3f122), CONST64(0xd2377cd44439c7b1), + CONST64(0x15265ee8be079c04), CONST64(0x57d6bf0317edaa97), + CONST64(0xd9f4fb7ae3911dfd), CONST64(0x9b041a914a7b2b6e), + CONST64(0x5c1538adb04570db), CONST64(0x1ee5d94619af4648), + CONST64(0x02a151b5f156289c), CONST64(0x4051b05e58bc1e0f), + CONST64(0x87409262a28245ba), CONST64(0xc5b073890b687329), + CONST64(0x4b9237f0ff14c443), CONST64(0x0962d61b56fef2d0), + CONST64(0xce73f427acc0a965), CONST64(0x8c8315cc052a9ff6), + CONST64(0x3a80143f5cf17f13), CONST64(0x7870f5d4f51b4980), + CONST64(0xbf61d7e80f251235), CONST64(0xfd913603a6cf24a6), + CONST64(0x73b3727a52b393cc), CONST64(0x31439391fb59a55f), + CONST64(0xf652b1ad0167feea), CONST64(0xb4a25046a88dc879), + CONST64(0xa8e6d8b54074a6ad), CONST64(0xea16395ee99e903e), + CONST64(0x2d071b6213a0cb8b), CONST64(0x6ff7fa89ba4afd18), + CONST64(0xe1d5bef04e364a72), CONST64(0xa3255f1be7dc7ce1), + CONST64(0x64347d271de22754), CONST64(0x26c49cccb40811c7), + CONST64(0x5cbd6cc0cc10fafc), CONST64(0x1e4d8d2b65facc6f), + CONST64(0xd95caf179fc497da), CONST64(0x9bac4efc362ea149), + CONST64(0x158e0a85c2521623), CONST64(0x577eeb6e6bb820b0), + CONST64(0x906fc95291867b05), CONST64(0xd29f28b9386c4d96), + CONST64(0xcedba04ad0952342), 
CONST64(0x8c2b41a1797f15d1), + CONST64(0x4b3a639d83414e64), CONST64(0x09ca82762aab78f7), + CONST64(0x87e8c60fded7cf9d), CONST64(0xc51827e4773df90e), + CONST64(0x020905d88d03a2bb), CONST64(0x40f9e43324e99428), + CONST64(0x2cffe7d5975e55e2), CONST64(0x6e0f063e3eb46371), + CONST64(0xa91e2402c48a38c4), CONST64(0xebeec5e96d600e57), + CONST64(0x65cc8190991cb93d), CONST64(0x273c607b30f68fae), + CONST64(0xe02d4247cac8d41b), CONST64(0xa2dda3ac6322e288), + CONST64(0xbe992b5f8bdb8c5c), CONST64(0xfc69cab42231bacf), + CONST64(0x3b78e888d80fe17a), CONST64(0x7988096371e5d7e9), + CONST64(0xf7aa4d1a85996083), CONST64(0xb55aacf12c735610), + CONST64(0x724b8ecdd64d0da5), CONST64(0x30bb6f267fa73b36), + CONST64(0x4ac29f2a07bfd00d), CONST64(0x08327ec1ae55e69e), + CONST64(0xcf235cfd546bbd2b), CONST64(0x8dd3bd16fd818bb8), + CONST64(0x03f1f96f09fd3cd2), CONST64(0x41011884a0170a41), + CONST64(0x86103ab85a2951f4), CONST64(0xc4e0db53f3c36767), + CONST64(0xd8a453a01b3a09b3), CONST64(0x9a54b24bb2d03f20), + CONST64(0x5d45907748ee6495), CONST64(0x1fb5719ce1045206), + CONST64(0x919735e51578e56c), CONST64(0xd367d40ebc92d3ff), + CONST64(0x1476f63246ac884a), CONST64(0x568617d9ef46bed9), + CONST64(0xe085162ab69d5e3c), CONST64(0xa275f7c11f7768af), + CONST64(0x6564d5fde549331a), CONST64(0x279434164ca30589), + CONST64(0xa9b6706fb8dfb2e3), CONST64(0xeb46918411358470), + CONST64(0x2c57b3b8eb0bdfc5), CONST64(0x6ea7525342e1e956), + CONST64(0x72e3daa0aa188782), CONST64(0x30133b4b03f2b111), + CONST64(0xf7021977f9cceaa4), CONST64(0xb5f2f89c5026dc37), + CONST64(0x3bd0bce5a45a6b5d), CONST64(0x79205d0e0db05dce), + CONST64(0xbe317f32f78e067b), CONST64(0xfcc19ed95e6430e8), + CONST64(0x86b86ed5267cdbd3), CONST64(0xc4488f3e8f96ed40), + CONST64(0x0359ad0275a8b6f5), CONST64(0x41a94ce9dc428066), + CONST64(0xcf8b0890283e370c), CONST64(0x8d7be97b81d4019f), + CONST64(0x4a6acb477bea5a2a), CONST64(0x089a2aacd2006cb9), + CONST64(0x14dea25f3af9026d), CONST64(0x562e43b4931334fe), + CONST64(0x913f6188692d6f4b), 
CONST64(0xd3cf8063c0c759d8), + CONST64(0x5dedc41a34bbeeb2), CONST64(0x1f1d25f19d51d821), + CONST64(0xd80c07cd676f8394), CONST64(0x9afce626ce85b507) +}; + + +uint64_t cdb_crc64(const void *buf, uint32_t len) +{ + uint32_t i; + uint64_t crc = 0xFFFFFFFFFFFFFFFF; + uint8_t *cbuf = (uint8_t *)buf; + + for (i = 0; i < len; i++) { + crc = CRC64_Table[(uint8_t)(crc >> 56) ^ *cbuf++] ^ (crc << 8); + } + return crc; +} + diff --git a/libdap-cuttdb/src/cdb_crc64.h b/libdap-cuttdb/src/cdb_crc64.h new file mode 100644 index 0000000000000000000000000000000000000000..50744fc844afe84cdcef8ddba5f6cff81ae5599a --- /dev/null +++ b/libdap-cuttdb/src/cdb_crc64.h @@ -0,0 +1,22 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. + * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#ifndef _CDB_CRC64_H_ +#define _CDB_CRC64_H_ +#include <stdint.h> + +uint64_t cdb_crc64(const void *buf, uint32_t len); + +#endif diff --git a/libdap-cuttdb/src/cdb_dumpdb.c b/libdap-cuttdb/src/cdb_dumpdb.c new file mode 100644 index 0000000000000000000000000000000000000000..99cddbb5355cb44d235ef08b27234a7199c2fe67 --- /dev/null +++ b/libdap-cuttdb/src/cdb_dumpdb.c @@ -0,0 +1,68 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. 
+ * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + +#include "cuttdb.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> +#include <stdbool.h> + + +bool itcb(void *arg, const char *key, int ksize, const char *val, int vsize, uint32_t expire, uint64_t oid) +{ +#define SBUFSIZE 4096 + char buf[SBUFSIZE]; + char *kvbuf = buf; + if (ksize + vsize + 2 > SBUFSIZE) + kvbuf = (char*)malloc(ksize + vsize + 2); + memcpy(kvbuf, key, ksize); + kvbuf[ksize] = '\t'; + memcpy(kvbuf + ksize + 1, val, vsize); + kvbuf[ksize + vsize + 1] = '\0'; + printf("%s\t%u\n", kvbuf, expire); + if (kvbuf != buf) + free(kvbuf); + return true; +} + +int main(int argc, char *argv[]) +{ + /* 1TB */ + int cache_limit = 1048576; + + if (argc < 2) { + fprintf(stderr, "Usage: %s dbpath [cachelimit(MB)].... \n", argv[0]); + return -1; + } + if (argc > 2) { + cache_limit = atoi(argv[2]); + } + + CDB *db = cdb_new(); + cdb_option(db, 0, 0, cache_limit); + if (cdb_open(db, argv[1], CDB_PAGEWARMUP) < 0) { + fprintf(stderr, "Database open error, unable to recovery\n"); + return -1; + } + void *it = cdb_iterate_new(db, 0); + cdb_iterate(db, itcb, NULL, it); + cdb_iterate_destroy(db, it); + cdb_destroy(db); +} + + + + + diff --git a/libdap-cuttdb/src/cdb_dumpraw.c b/libdap-cuttdb/src/cdb_dumpraw.c new file mode 100644 index 0000000000000000000000000000000000000000..53bbe11c6e7bec1723c97fb951b63468889d555a --- /dev/null +++ b/libdap-cuttdb/src/cdb_dumpraw.c @@ -0,0 +1,115 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. 
+ * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <stdint.h> + +#define SI4 4 +#define SI8 8 + +/* data record */ +typedef struct { + /* disk store starts at following field */ + uint32_t magic; + uint32_t ksize; + uint32_t vsize; + uint32_t expire; + uint64_t oid; + char buf[0]; +} __attribute__((packed)) CDBREC; + +/* real size of a record header when stored on disk */ +#define RECHSIZE (SI4 * 4 + SI8) +/* real size of a record when stored on disk */ +#define RECSIZE(r) (RECHSIZE + (r)->ksize + (r)->vsize) + +#define FILEMETASIZE 64 +#define ALIGNBYTES 16 +#define RECMAGIC 0x19871022 +#define DELRECMAGIC 0x19871023 +#define FILEMAGICHEADER "CuTtDbFiLePaRtIaL" +#define FILEMAGICLEN (strlen(FILEMAGICHEADER)) +#define OFFALIGNED(off) (((off) & (ALIGNBYTES - 1))? 
((off) | (ALIGNBYTES - 1)) + 1: off) + + + +void process(const char *filename) +{ +#define SBUFSIZE 4096 + int fd = open(filename, O_RDONLY, 0644); + char buf[SBUFSIZE]; + if (fd < 0) + fprintf(stderr, "%s Open failed\n", filename); + + long filesize = lseek(fd, 0, SEEK_END); + long pos = FILEMETASIZE; + char *map = (char*)mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0); + if (memcmp(map, FILEMAGICHEADER, FILEMAGICLEN)) { + fprintf(stderr, "%s is not a cuttdb file\n", filename); + close(fd); + return; + } + + while(pos < filesize) { + char *kvbuf = buf; + CDBREC *rec = (CDBREC*)&map[pos]; + if (rec->magic != RECMAGIC && rec->magic != DELRECMAGIC) { + pos += ALIGNBYTES; + continue; + } + + pos += OFFALIGNED(RECSIZE(rec)); + if (rec->magic != RECMAGIC) + continue; + + if (rec->ksize + rec->vsize + 2 > SBUFSIZE) { + kvbuf = (char*)malloc(rec->ksize + rec->vsize + 2); + } + memcpy(kvbuf, rec->buf, rec->ksize); + kvbuf[rec->ksize] = '\t'; + memcpy(kvbuf + rec->ksize + 1, rec->buf + rec->ksize, rec->vsize); + kvbuf[rec->ksize + rec->vsize + 1] = '\0'; + printf("%s\t%u\n", kvbuf, rec->expire); + if (kvbuf != buf) + free(kvbuf); + } + + munmap(map, filesize); + close(fd); +} + + + + +int main(int argc, char *argv[]) +{ + if (argc < 2) { + fprintf(stderr, "Usage: %s dat########.cdb dat########.cdb .... \n", argv[0]); + return 0; + } + for(int i = 1; i < argc; i++) + process(argv[i]); + return 0; +} + + + + diff --git a/libdap-cuttdb/src/cdb_errno.c b/libdap-cuttdb/src/cdb_errno.c new file mode 100644 index 0000000000000000000000000000000000000000..432d154ac4dba8d8a3879b16905ae69468c58094 --- /dev/null +++ b/libdap-cuttdb/src/cdb_errno.c @@ -0,0 +1,78 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. 
+ * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#include "cuttdb.h" +#include "cdb_errno.h" +#include "cdb_types.h" +#include "cdb_core.h" +#include <pthread.h> + + +int cdb_errno(CDB *db) +{ + return (long)pthread_getspecific(*(pthread_key_t*)db->errkey); +} + +const char *cdb_errmsg(int ecode) +{ + switch(ecode) { + case CDB_SUCCESS: + return "Success"; + case CDB_NOTFOUND: + return "Key Not Found"; + case CDB_EXIST: + return "Item Already Exists"; + case CDB_DIRNOEXIST: + return "Path Open Failed"; + case CDB_OPENERR: + return "File Open Failed"; + case CDB_PIDEXIST: + return "Opened By Another Process"; + case CDB_DATAERRDAT: + return "Data File Content Error"; + case CDB_DATAERRIDX: + return "Index File Content Error"; + case CDB_WRITEERR: + return "Write To File Error"; + case CDB_READERR: + return "Read From File Error"; + case CDB_NOFID: + return "Internal File Lost"; + case CDB_INTERNALERR: + return "Internal Error"; + case CDB_DATAERRMETA: + return "File Header Error"; + case CDB_MEMDBNOCACHE: + return "MemDB Mode With Zero Record Cache Size"; + default: + return "Error For Errno"; + } +} + + +void cdb_seterrcb(CDB *db, CDB_ERRCALLBACK errcb, void *arg) +{ + db->errcb = errcb; + db->errcbarg = arg; +} + + +void cdb_seterrno(CDB *db, int ecode, const char *source, int line) +{ + pthread_setspecific(*(pthread_key_t*)db->errkey, (void*)(long)ecode); + if (ecode != CDB_SUCCESS && db->errcb) { + db->errcb(db->errcbarg, ecode, source, line); + } +} diff --git a/libdap-cuttdb/src/cdb_errno.h b/libdap-cuttdb/src/cdb_errno.h new file mode 100644 index 0000000000000000000000000000000000000000..f274819de73b2133d2648aa6490ea8f5cf66b41c --- /dev/null +++ b/libdap-cuttdb/src/cdb_errno.h @@ -0,0 +1,22 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. 
+ * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#ifndef _CDB_ERRNO_H_ +#define _CDB_ERRNO_H_ + +void cdb_seterrno(CDB *db, int ecode, const char *source, int line); + +#endif + diff --git a/libdap-cuttdb/src/cdb_hashtable.c b/libdap-cuttdb/src/cdb_hashtable.c new file mode 100644 index 0000000000000000000000000000000000000000..f8746a681197799f797a18789b41387a77a6bc83 --- /dev/null +++ b/libdap-cuttdb/src/cdb_hashtable.c @@ -0,0 +1,539 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. + * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#include "cdb_hashtable.h" +#include <stdlib.h> +#include <string.h> + +/* +#define LRUPREV(i) (*(CDBHTITEM**)&((i)->buf[0])) +#define LRUNEXT(i) (*(CDBHTITEM**)&((i)->buf[sizeof(void*)])) +*/ + +#define LRUPREV(i) ((i)->lruptr[0]) +#define LRUNEXT(i) ((i)->lruptr[1]) + +static uint32_t MurmurHash1( const void * key, int len) +{ + const unsigned int m = 0xc6a4a793; + const int r = 16; + unsigned int h = 0x19900917 ^ (len * m); + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k = *(unsigned int *)data; + h += k; h *= m; h ^= h >> 16; + data += 4; len -= 4; + } + + switch(len) + { + case 3: + h += data[2] << 16; + case 2: + h += data[1] << 8; + case 1: + h += data[0]; + h *= m; + h ^= h >> r; + }; + + h *= m; h ^= h >> 10; + h *= m; h ^= h >> 17; + return h; +} + +void *cdb_ht_itemkey(CDBHASHTABLE *ht, CDBHTITEM *item) +{ + return (void *)(item->buf + ht->lru * 2 * sizeof(void*)); +} + +void *cdb_ht_itemval(CDBHASHTABLE *ht, CDBHTITEM *item) +{ + return (void *)(item->buf + ht->lru * 2 * sizeof(void*) + item->ksize); +} + +CDBHASHTABLE *cdb_ht_new(bool lru, CDBHASHFUNC hashfunc) +{ + CDBHASHTABLE *ht; + + ht = 
(CDBHASHTABLE*)malloc(sizeof(CDBHASHTABLE)); + ht->hash = NULL; + ht->lru = lru; + ht->num = ht->size = 0; + ht->tail = ht->head = NULL; + for(uint32_t i = 0; i < (1<<CDBHTBNUMPOW); i++) { + CDBHTBUCKET *bucket = &(ht->buckets[i]); + bucket->bnum = 2; + uint32_t lsize = sizeof(CDBHTITEM *) * bucket->bnum; + bucket->rnum = 0; + bucket->items = (CDBHTITEM **)malloc(lsize); + ht->size += lsize; + memset(bucket->items, 0, lsize); + } + ht->hash = hashfunc; + if (ht->hash == NULL) + ht->hash = MurmurHash1; + + ht->size += sizeof(CDBHASHTABLE); + + return ht; +} + +CDBHTITEM *cdb_ht_newitem(CDBHASHTABLE *ht, int ksize, int vsize) +{ + CDBHTITEM *item; + int hsize; + + if (ht->lru) + hsize = sizeof(CDBHTITEM) + 2 * sizeof(void*); + else + hsize = sizeof(CDBHTITEM); + + item = (CDBHTITEM*)malloc(hsize + ksize + vsize); + item->ksize = ksize; + item->vsize = vsize; + if (ht->lru) { + LRUPREV(item) = NULL; + LRUNEXT(item) = NULL; + } + return item; +} + + + + +void cdb_ht_insert(CDBHASHTABLE *ht, CDBHTITEM *item) +{ + uint32_t bid, hid; + CDBHTBUCKET *bucket; + + item->hash = ht->hash(cdb_ht_itemkey(ht, item), item->ksize); + bid = item->hash & ((1<<CDBHTBNUMPOW)-1); + bucket = &(ht->buckets[bid]); + hid = (item->hash >> CDBHTBNUMPOW) & (bucket->bnum-1); + + if (bucket->rnum > bucket->bnum * 2) { + CDBHTITEM **ilist; + uint32_t exp = 2; + if (bucket->bnum < 512) + exp = 4; + int listsize = (bucket->bnum * exp) * sizeof(CDBHTITEM*); + ilist = (CDBHTITEM**)malloc(listsize); + memset(ilist, 0, listsize); + for(uint32_t i = 0; i < bucket->bnum; i++) { + CDBHTITEM *curitem = bucket->items[i]; + while(curitem != NULL) { + CDBHTITEM *nextitem = curitem->hnext; + uint32_t hid = (curitem->hash>>CDBHTBNUMPOW) + & (bucket->bnum * exp - 1); + curitem->hnext = ilist[hid]; + ilist[hid] = curitem; + curitem = nextitem; + } + } + free(bucket->items); + bucket->items = ilist; + ht->size += listsize - bucket->bnum * sizeof(CDBHTITEM *); + bucket->bnum *= exp; + hid = (item->hash >> 
CDBHTBNUMPOW) & (bucket->bnum - 1); + } + + { + CDBHTITEM *curitem = bucket->items[hid]; + CDBHTITEM *preitem = NULL; + while(curitem != NULL) { + if (curitem->hash == item->hash + && curitem->ksize == item->ksize + && memcmp(cdb_ht_itemkey(ht, curitem), + cdb_ht_itemkey(ht, item) ,curitem->ksize) == 0) { + CDBHTITEM *tmp; + if (ht->lru) { + if (LRUPREV(curitem)) + LRUNEXT(LRUPREV(curitem)) = LRUNEXT(curitem); + if (LRUNEXT(curitem)) + LRUPREV(LRUNEXT(curitem)) = LRUPREV(curitem); + if (ht->head == curitem) + ht->head = LRUNEXT(curitem); + if (ht->tail == curitem) + ht->tail = LRUPREV(curitem); + } + if (preitem) + preitem->hnext = curitem->hnext; + else + bucket->items[hid] = curitem->hnext; + tmp = curitem->hnext; + ht->size -= sizeof(CDBHTITEM) + curitem->ksize + curitem->vsize + + (ht->lru > 0) * sizeof(CDBHTITEM*) * 2; + ht->num--; + bucket->rnum--; + free(curitem); + curitem = tmp; + break; + } + preitem = curitem; + curitem = curitem->hnext; + } + } + + item->hnext = bucket->items[hid]; + bucket->items[hid] = item; + + if (ht->lru) { + if (ht->head) LRUPREV(ht->head) = item; + LRUPREV(item) = NULL; + LRUNEXT(item) = ht->head; + ht->head = item; + if (ht->tail == NULL) + ht->tail = item; + } + + bucket->rnum++; + ht->num++; + ht->size += sizeof(CDBHTITEM) + item->ksize + item->vsize + + ht->lru * sizeof(CDBHTITEM*) * 2; +} + + +void *cdb_ht_insert2(CDBHASHTABLE *ht, const void *key, int ksize, const void *val, int vsize) +{ + CDBHTITEM *item; + + item = cdb_ht_newitem(ht, ksize, vsize); + memcpy(cdb_ht_itemkey(ht, item), key, ksize); + memcpy(cdb_ht_itemval(ht, item), val, vsize); + cdb_ht_insert(ht, item); + return cdb_ht_itemval(ht, item); +} + +void *cdb_ht_get(CDBHASHTABLE *ht, const void *key, int ksize, int *vsize, bool mtf) +{ + CDBHTITEM *res; + + res = cdb_ht_get3(ht, key, ksize, mtf); + if (res) { + *vsize = res->vsize; + return cdb_ht_itemval(ht, res); + } else { + *vsize = 0; + return NULL; + } +} + + +void *cdb_ht_get2(CDBHASHTABLE *ht, const 
void *key, int ksize, bool mtf) +{ + CDBHTITEM *res; + + res = cdb_ht_get3(ht, key, ksize, mtf); + if (res) + return cdb_ht_itemval(ht, res); + else + return NULL; +} + + +CDBHTITEM *cdb_ht_get3(CDBHASHTABLE *ht, const void *key, int ksize, bool mtf) +{ + uint32_t hash, bid, hid; + CDBHTBUCKET *bucket; + CDBHTITEM *curitem; + + hash = ht->hash(key, ksize); + bid = hash & ((1<<CDBHTBNUMPOW)-1); + bucket = &(ht->buckets[bid]); + hid = (hash >> CDBHTBNUMPOW) & (bucket->bnum - 1); + + curitem = bucket->items[hid]; + while (curitem != NULL) { + if (curitem->hash == hash + && curitem->ksize == ksize + && memcmp(cdb_ht_itemkey(ht, curitem), key , ksize) == 0) { + if (ht->lru && mtf && ht->head != curitem) { + if (LRUPREV(curitem)) + LRUNEXT(LRUPREV(curitem)) = LRUNEXT(curitem); + if (LRUNEXT(curitem)) + LRUPREV(LRUNEXT(curitem)) = LRUPREV(curitem); + if (ht->tail == curitem) + ht->tail = LRUPREV(curitem); + + LRUNEXT(curitem) = ht->head; + LRUPREV(ht->head) = curitem; + ht->head = curitem; + LRUPREV(curitem) = NULL; + } + return curitem; + } + curitem = curitem->hnext; + } + return NULL; +} + + +bool cdb_ht_exist(CDBHASHTABLE *ht, const void *key, int ksize) +{ + int vsize; + return (cdb_ht_get(ht, key, ksize, &vsize, false) != NULL); +} + + +int cdb_ht_del2(CDBHASHTABLE *ht, const void *key, int ksize) +{ + CDBHTITEM *res = NULL; + res = cdb_ht_del(ht, key, ksize); + if (res) { + free(res); + return 0; + } + return -1; +} + + +CDBHTITEM *cdb_ht_del(CDBHASHTABLE *ht, const void *key, int ksize) +{ + uint32_t hash, bid, hid; + CDBHTBUCKET *bucket; + CDBHTITEM *curitem, *preitem; + CDBHTITEM *res = NULL; + + hash = ht->hash(key, ksize); + bid = hash & ((1<<CDBHTBNUMPOW)-1); + bucket = &(ht->buckets[bid]); + hid = (hash >> CDBHTBNUMPOW) & (bucket->bnum - 1); + + curitem = bucket->items[hid]; + preitem = NULL; + while(curitem != NULL) { + if (curitem->hash == hash + && curitem->ksize == ksize + && memcmp(cdb_ht_itemkey(ht, curitem), + key, ksize) == 0) { + if (ht->lru) { + if 
(LRUPREV(curitem)) + LRUNEXT(LRUPREV(curitem)) = LRUNEXT(curitem); + if (LRUNEXT(curitem)) + LRUPREV(LRUNEXT(curitem)) = LRUPREV(curitem); + if (ht->head == curitem) + ht->head = LRUNEXT(curitem); + if (ht->tail == curitem) + ht->tail = LRUPREV(curitem); + } + if (preitem) + preitem->hnext = curitem->hnext; + else + bucket->items[hid] = curitem->hnext; + ht->size -= sizeof(CDBHTITEM) + curitem->ksize + curitem->vsize + + (ht->lru > 0) * sizeof(CDBHTITEM*) * 2; + ht->num--; + bucket->rnum--; + res = curitem; + curitem = curitem->hnext; + break; + } + preitem = curitem; + curitem = curitem->hnext; + } + + return res; +} + + +void cdb_ht_removetail(CDBHASHTABLE *ht) +{ + CDBHTITEM *item; + + item = cdb_ht_poptail(ht); + if (item) + free(item); + return; +} + + +CDBHTITEM *cdb_ht_gettail(CDBHASHTABLE *ht) +{ + return ht->tail; +} + + +CDBHTITEM *cdb_ht_poptail(CDBHASHTABLE *ht) +{ + CDBHTITEM *item = ht->tail, *curitem, *preitem;; + CDBHTBUCKET *bucket; + uint32_t bid, hid; + + if (!(ht->lru) || item == NULL) + return NULL; + + bid = item->hash & ((1<<CDBHTBNUMPOW)-1); + bucket = &(ht->buckets[bid]); + hid = (item->hash >> CDBHTBNUMPOW) & (bucket->bnum - 1); + + curitem = bucket->items[hid]; + preitem = NULL; + while (curitem != NULL) { + if (curitem->hash == item->hash + && curitem->ksize == item->ksize + && memcmp(cdb_ht_itemkey(ht, curitem), + cdb_ht_itemkey(ht, item), item->ksize) == 0) { + if (preitem) { + preitem->hnext = curitem->hnext; + } else { + bucket->items[hid] = curitem->hnext; + } + break; + } + preitem = curitem; + curitem = curitem->hnext; + } + + if (LRUPREV(item)) + LRUNEXT(LRUPREV(item)) = NULL; + if (ht->head == item) + ht->head = NULL; + ht->tail = LRUPREV(item); + bucket->rnum--; + ht->num--; + ht->size -= sizeof(CDBHTITEM) + item->ksize + item->vsize + + sizeof(CDBHTITEM*) * 2; + return item; +} + +void cdb_ht_clean(CDBHASHTABLE *ht) +{ + for(uint32_t i = 0; i < (1<<CDBHTBNUMPOW); i++) { + CDBHTBUCKET *bucket = &(ht->buckets[i]); + for(uint32_t 
j = 0; j < bucket->bnum; j++) { + CDBHTITEM *curitem = bucket->items[j]; + while(curitem != NULL) { + CDBHTITEM *tmp = curitem->hnext; + free(curitem); + curitem = tmp; + } + bucket->items[j] = NULL; + } + bucket->rnum = 0; + } + ht->num = 0; +} + + +void cdb_ht_destroy(CDBHASHTABLE *ht) +{ + if (ht->lru) { + CDBHTITEM *curitem = ht->head; + while(curitem) { + CDBHTITEM *nextitem = LRUNEXT(curitem); + free(curitem); + curitem = nextitem; + } + } + + for(uint32_t i = 0; i < (1<<CDBHTBNUMPOW); i++) { + CDBHTBUCKET *bucket = &(ht->buckets[i]); + + for(uint32_t j = 0; j < bucket->bnum && (!ht->lru); j++) { + CDBHTITEM *curitem = bucket->items[j]; + while(curitem != NULL) { + CDBHTITEM *tmp = curitem->hnext; + free(curitem); + curitem = tmp; + } + } + free(bucket->items); + } + free(ht); +} + + +CDBHTITEM *cdb_ht_iterbegin(CDBHASHTABLE *ht) +{ + for(uint32_t i = 0; i < (1<<CDBHTBNUMPOW); i++) { + CDBHTBUCKET *bucket = &(ht->buckets[i]); + if (!bucket->rnum) + continue; + for(uint32_t j = 0; j < bucket->bnum; j++) + if (bucket->items[j]) + return bucket->items[j]; + } + + return NULL; +} + + +CDBHTITEM *cdb_ht_iternext(CDBHASHTABLE *ht, CDBHTITEM *cur) +{ + if (cur == NULL) + return NULL; + + if (cur->hnext) + return cur->hnext; + + uint32_t bid = cur->hash & ((1<<CDBHTBNUMPOW)-1); + CDBHTBUCKET *bucket = &(ht->buckets[bid]); + uint32_t hid = (cur->hash >> CDBHTBNUMPOW) & (bucket->bnum - 1); + + for(uint32_t i = hid + 1; i < bucket->bnum; i++) { + if (bucket->items[i]) + return bucket->items[i]; + } + + for(uint32_t i = bid + 1; i < (1<<CDBHTBNUMPOW); i++) { + CDBHTBUCKET *bucket = &(ht->buckets[i]); + if (!bucket->rnum) + continue; + for(int j = 0; j < bucket->bnum; j++) + if (bucket->items[j]) + return bucket->items[j]; + } + + return NULL; +} + + +#ifdef _UT_ +#include <stdio.h> +#include <time.h> +int main(int argc, char *argv[]) +{ + CDBHASHTABLE *ht; + long k, v; + ht = cdb_ht_new(true, NULL); + for(int i = 0; i < 1000; i++) { + k = i; + v = i * 1000; + 
cdb_ht_insert2(ht, &k, sizeof(long), &v, sizeof(long)); + } + + srand(time(NULL)); + + for(int i = 0; i < 1000; i++) { + long *v, k = rand() % 1000; + int vsize; + v = (long*)cdb_ht_get(ht, &k, sizeof(long), &vsize, true); + printf("get: %ld -> %ld (%d)\n", k, *v, vsize); + } + + printf("total size: %d num: %d\n", ht->size, ht->num); + + CDBHTITEM *item; + item = cdb_ht_poptail(ht); + printf("tail: %ld - %ld\n", *(long*)cdb_ht_itemkey(ht, item), *(long*)cdb_ht_itemval(ht, item)); + free(item); + item = cdb_ht_poptail(ht); + printf("tail: %ld - %ld\n", *(long*)cdb_ht_itemkey(ht, item), *(long*)cdb_ht_itemval(ht, item)); + free(item); +} +#endif diff --git a/libdap-cuttdb/src/cdb_hashtable.h b/libdap-cuttdb/src/cdb_hashtable.h new file mode 100644 index 0000000000000000000000000000000000000000..1f35b376dae7dd7618c24500dc729ab71577ad45 --- /dev/null +++ b/libdap-cuttdb/src/cdb_hashtable.h @@ -0,0 +1,139 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. 
+ * See the LICENSE file for full text
+ *
+ * Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#ifndef _CDB_HASHTABLE_H_
+#define _CDB_HASHTABLE_H_
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef uint32_t (*CDBHASHFUNC)(const void *, int); /* hash function: (key, key size in bytes) -> 32-bit hash */
+
+/* log2 of the number of level-1 buckets (1<<8 = 256); many small buckets make the table grow more smoothly */
+#define CDBHTBNUMPOW 8
+
+
+typedef struct CDBHTITEM
+{
+    int ksize; /* key size in bytes */
+    int vsize; /* value size in bytes */
+    uint32_t hash; /* hash of the key, filled in on insert */
+    /* next element in the same hash chain */
+    struct CDBHTITEM *hnext;
+    /* in LRU mode, the first bytes of buf hold two pointers: the prev/next element of the LRU list */
+    struct CDBHTITEM *lruptr[0];
+    char buf[0]; /* [LRU pointers, LRU mode only] + key bytes + value bytes */
+} __attribute__((packed)) CDBHTITEM;
+
+
+typedef struct {
+    /* array of hash chain heads */
+    CDBHTITEM **items;
+    /* number of allocated slots in the bucket */
+    uint32_t bnum;
+    /* number of items that exist in the bucket */
+    uint32_t rnum;
+} CDBHTBUCKET;
+
+
+typedef struct CDBHASHTABLE {
+    /* is the table in LRU mode? */
+    bool lru;
+    /* hash function in use (user specified, or the built-in default) */
+    CDBHASHFUNC hash;
+    /* fixed number of level-1 buckets */
+    CDBHTBUCKET buckets[1<<CDBHTBNUMPOW];
+    /* memory usage in bytes */
+    uint64_t size;
+    /* number of items */
+    uint64_t num;
+    /* in LRU mode, the most recently inserted or moved-to-front item */
+    CDBHTITEM *head;
+    /* in LRU mode, the least recently used item */
+    CDBHTITEM *tail;
+} CDBHASHTABLE;
+
+
+/* get the pointer to the key of an item */
+/* #define cdb_ht_itemkey(ht, item) (item->buf + ht->lru * 2 * sizeof(void*)) */
+void *cdb_ht_itemkey(CDBHASHTABLE *ht, CDBHTITEM *item);
+
+/* get the pointer to the value of an item */
+/* #define cdb_ht_itemval(ht, item) (item->buf + ht->lru * 2 * sizeof(void*) + item->ksize) */
+void *cdb_ht_itemval(CDBHASHTABLE *ht, CDBHTITEM *item);
+
+/* create a hashtable, either a plain one or one with a Least-Recently-Used list
+   The LRU mode needs space for two extra pointers in every element
+   the hash function can be specified by the user; NULL selects the built-in one */
+CDBHASHTABLE *cdb_ht_new(bool lru, CDBHASHFUNC hashfunc);
+
+/* clean and free the hashtable */
+void cdb_ht_destroy(CDBHASHTABLE *ht);
+
+/* allocate a new item with the specified sizes, but do not insert it into the table */
+CDBHTITEM *cdb_ht_newitem(CDBHASHTABLE *ht, int ksize, int vsize);
+
+/* insert an already allocated and filled item into the table; an item with the same key is replaced and freed */
+void cdb_ht_insert(CDBHASHTABLE *ht, CDBHTITEM *item);
+
+/* allocate and insert an item into the table by key and value, return the pointer to the value in the table */
+void *cdb_ht_insert2(CDBHASHTABLE *ht, const void *key, int ksize, const void *val, int vsize);
+
+/* get the value of an item and its size, move the item to front if mtf == true; NULL and size 0 if absent */
+void *cdb_ht_get(CDBHASHTABLE *ht, const void *key, int ksize, int *vsize, bool mtf);
+
+/* get the value of an item, assuming the size is known, move the item to front if mtf == true */
+void *cdb_ht_get2(CDBHASHTABLE *ht, const void *key, int ksize, bool mtf);
+
+/* get the pointer to the item itself; it is not a copy */
+CDBHTITEM *cdb_ht_get3(CDBHASHTABLE *ht, const void *key, int ksize, bool mtf);
+
+/* check if an item with the key exists */
+bool cdb_ht_exist(CDBHASHTABLE *ht, const void *key, int ksize);
+
+/* delete and free an item from the table by its key; returns 0 if it existed, -1 otherwise */
+int cdb_ht_del2(CDBHASHTABLE *ht, const void *key, int ksize);
+
+/* remove an item from the table and return it; the item must be freed by the caller */
+CDBHTITEM *cdb_ht_del(CDBHASHTABLE *ht, const void *key, int ksize);
+
+/* delete and free the last (LRU tail) item in the table */
+void cdb_ht_removetail(CDBHASHTABLE *ht);
+
+/* return the last (LRU tail) item in the table, neither removed nor freed */
+CDBHTITEM *cdb_ht_gettail(CDBHASHTABLE *ht);
+
+/* remove and return the last (LRU tail) item; it must be freed by the caller */
+CDBHTITEM *cdb_ht_poptail(CDBHASHTABLE *ht);
+
+/* free all elements in the table, the table itself stays allocated */
+void cdb_ht_clean(CDBHASHTABLE *ht);
+
+/* start iterating: return the first item, NULL if the table is empty */
+CDBHTITEM *cdb_ht_iterbegin(CDBHASHTABLE *ht);
+
+/* get the item that follows the current one, NULL at the end */
+CDBHTITEM *cdb_ht_iternext(CDBHASHTABLE *ht, CDBHTITEM *cur);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
+
diff --git a/libdap-cuttdb/src/cdb_lock.c b/libdap-cuttdb/src/cdb_lock.c
new file mode 100644
index 0000000000000000000000000000000000000000..54b91071cba0e9ac124c616a54b93fdbe2e29894
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_lock.c
@@ -0,0 +1,75 @@
+/*
+ * CuttDB - a fast key-value storage engine
+ *
+ *
+ * http://code.google.com/p/cuttdb/
+ *
+ * Copyright (c) 2012, Siyuan Fu. All rights reserved.
+ * Use and distribution licensed under the BSD license.
+ * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#include "cdb_lock.h" +#include <stdlib.h> +#include <pthread.h> +#include <sched.h> + + +CDBLOCK *cdb_lock_new(int ltype) +{ + CDBLOCK *lock = NULL; + if (ltype == CDB_LOCKSPIN) { + lock = (CDBLOCK *)malloc(sizeof(CDBLOCK) + sizeof(pthread_spinlock_t)); + pthread_spin_init((pthread_spinlock_t*)&lock->lock, PTHREAD_PROCESS_PRIVATE); + } else if (ltype == CDB_LOCKMUTEX) { + lock = (CDBLOCK *)malloc(sizeof(CDBLOCK) + sizeof(pthread_mutex_t)); + pthread_mutex_init((pthread_mutex_t*)&lock->lock, NULL); + } + lock->ltype = ltype; + + return lock; +} + + +void cdb_lock_lock(CDBLOCK *lock) +{ + if (lock->ltype == CDB_LOCKSPIN) + pthread_spin_lock((pthread_spinlock_t*)&lock->lock); + else if (lock->ltype == CDB_LOCKMUTEX) + pthread_mutex_lock((pthread_mutex_t*)&lock->lock); +} + + +void cdb_lock_unlock(CDBLOCK *lock) +{ + if (lock->ltype == CDB_LOCKSPIN) + pthread_spin_unlock((pthread_spinlock_t*)&lock->lock); + else if (lock->ltype == CDB_LOCKMUTEX) + pthread_mutex_unlock((pthread_mutex_t*)&lock->lock); +} + + +void cdb_lock_destory(CDBLOCK *lock) +{ + if (lock->ltype == CDB_LOCKSPIN) + pthread_spin_destroy((pthread_spinlock_t*)&lock->lock); + else if (lock->ltype == CDB_LOCKMUTEX) + pthread_mutex_destroy((pthread_mutex_t*)&lock->lock); + + free(lock); +} + + +int cdb_lock_trylock(CDBLOCK *lock) +{ + if (lock->ltype == CDB_LOCKSPIN) + return pthread_spin_trylock((pthread_spinlock_t*)&lock->lock); + else if (lock->ltype == CDB_LOCKMUTEX) + return pthread_mutex_trylock((pthread_mutex_t*)&lock->lock); + return 0; +} + diff --git a/libdap-cuttdb/src/cdb_lock.h b/libdap-cuttdb/src/cdb_lock.h new file mode 100644 index 0000000000000000000000000000000000000000..587fcdb18b40722da27f0eebff9fdb0e05934ce3 --- /dev/null +++ b/libdap-cuttdb/src/cdb_lock.h @@ -0,0 +1,49 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright 
(c) 2012, Siyuan Fu. All rights reserved.
+ * Use and distribution licensed under the BSD license.
+ * See the LICENSE file for full text
+ *
+ * Author: Siyuan Fu <fusiyuan2010@gmail.com>
+ *
+ */
+
+
+#ifndef _CDB_LOCK_H_
+#define _CDB_LOCK_H_
+
+
+enum {
+    /* spinlock */
+    CDB_LOCKSPIN,
+    /* mutex, which may cause an OS context switch; mainly used where disk I/O happens */
+    CDB_LOCKMUTEX,
+};
+
+/* may be used to indicate whether the area is protected */
+enum {
+    CDB_LOCKED,
+    CDB_NOTLOCKED,
+};
+
+typedef struct CDBLOCK
+{
+    int ltype; /* CDB_LOCKSPIN or CDB_LOCKMUTEX */
+    char lock[0]; /* storage of the underlying pthread lock object */
+} CDBLOCK;
+
+
+CDBLOCK *cdb_lock_new(int ltype); /* allocate and initialise a lock of the given kind */
+void cdb_lock_lock(CDBLOCK *lock); /* acquire */
+void cdb_lock_unlock(CDBLOCK *lock); /* release */
+void cdb_lock_destory(CDBLOCK *lock); /* destroy and free the lock */
+int cdb_lock_trylock(CDBLOCK *lock); /* acquire without waiting, returns the pthread trylock result */
+
+
+
+#endif
+
diff --git a/libdap-cuttdb/src/cdb_types.h b/libdap-cuttdb/src/cdb_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfb6e6b8c7b7be2940d25e4ff9f8c098d3bc48c4
--- /dev/null
+++ b/libdap-cuttdb/src/cdb_types.h
@@ -0,0 +1,144 @@
+/*
+ * CuttDB - a fast key-value storage engine
+ *
+ *
+ * http://code.google.com/p/cuttdb/
+ *
+ * Copyright (c) 2012, Siyuan Fu. All rights reserved.
+ * Use and distribution licensed under the BSD license.
+ * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#ifndef _CDB_TYPES_H_ +#define _CDB_TYPES_H_ +#include <stdint.h> + +#define KB 1024 +#define MB 1048576 +#define CDBMIN(a, b) ((a)<(b)?(a):(b)) +#define CDBMAX(a, b) ((a)>(b)?(a):(b)) + +#define SI8 8 +#define SI4 4 +/* space reserved in stack for i/o, avoid some malloc/free */ +#define SBUFSIZE (64 * KB) + +/* a default disk read size for index page, 3KB is enough(a page with 300 items) */ +#define PAGEAREADSIZE (3 * KB) + +/* reserved in stack for matched items in a hash index page */ +#define SFOFFNUM 8 + +/* a valid virtual offset */ +#define OFFNOTNULL(o) (((o).i4)||((o).i2)) +/* a null virtual offset */ +#define OFFNULL(o) (((o).i4==0)&&((o).i2==0)) +/* nullify an offset */ +#define OFFZERO(o) do{(o).i4=0;(o).i2=0;}while(0) +/* offset is equal ? */ +#define OFFEQ(a,b) (((a).i4==(b).i4)&&((a).i2==(b).i2)) +/* hash in page is equal ? */ +#define PHASHEQ(a,b) (((a).i2==(b).i2)&&((a).i1==(b).i1)) +/* page size increment */ +#define CDB_PAGEINCR 4 + + +/* if page cache size exceeds the limit */ +#define PCOVERFLOW(db) ((db)->dpcache && (db)->dpcache->size + (db)->pcache->size > (db)->pclimit) +/* if record cache size exceeds the limit */ +#define RCOVERFLOW(db) ((db)->rcache && (db)->rcache->size > (db)->rclimit) + +/* timeout for a dirty index page stays since last modify */ +#define DPAGETIMEOUT 40 +/* operation on main table are isolated by these locks */ +#define MLOCKNUM 256 + +#define CDBHASH64(a, b) cdb_crc64(a, b) + +/* all virtual offsets are 48-bits */ +typedef struct FOFF +{ + uint32_t i4; + uint16_t i2; +} __attribute__((packed)) FOFF; + + + +#define SFOFF (sizeof(FOFF)) + + +/* all hash value in index page are 24-bits + range 0..16M guarantee very low collision + with less than a hundred records in a page */ +typedef struct PHASH +{ + uint16_t i2; + uint8_t i1; +} __attribute__((packed)) PHASH; + + +/* an item in index page contains a hash and an 
/*
 * data record
 *
 * The leading fields (ooff .. val) exist only in memory; the fields from
 * `magic` onwards are the on-disk header. RECHSIZE (4 * SI4 + SI8 = 24
 * bytes) matches magic + ksize + vsize + expire + oid, and RECSIZE() adds
 * ksize + vsize for the payload.
 */
typedef struct CDBREC{
    /* where the data come from */
    FOFF ooff;
    uint32_t osize;

    /* convenience pointers to the key and the value
       (presumably pointing into buf -- TODO confirm in the readers) */
    void *key;
    void *val;

    /* disk store starts at following field */
    uint32_t magic;
    /* key size in bytes, as used by RECSIZE() */
    uint32_t ksize;
    /* value size in bytes, as used by RECSIZE() */
    uint32_t vsize;
    uint32_t expire;
    uint64_t oid;
    /* zero-length trailing array (GNU extension) marking the start of the
       variable-length payload that follows the header */
    char buf[0];
} __attribute__((packed)) CDBREC;
+ * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#include "cdb_vio.h" +#include "cdb_types.h" +#include "vio_apnd2.h" +#include "stdlib.h" + + +CDBVIO *cdb_vio_new(int type) +{ + CDBVIO *res; + res = (CDBVIO *)malloc(sizeof(CDBVIO)); + switch(type) { + case CDBVIOAPND2: + vio_apnd2_init(res); + break; + default: + vio_apnd2_init(res); + break; + } + return res; +} + +int cdb_vio_destroy(CDBVIO *vio) +{ + free(vio); + return 0; +} + diff --git a/libdap-cuttdb/src/cdb_vio.h b/libdap-cuttdb/src/cdb_vio.h new file mode 100644 index 0000000000000000000000000000000000000000..5c6e7e205813f11d84d45f0fa8916cff1edc3bbe --- /dev/null +++ b/libdap-cuttdb/src/cdb_vio.h @@ -0,0 +1,101 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. + * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#ifndef _CDB_VIO_H_ +#define _CDB_VIO_H_ +#include "cdb_types.h" +#include "cuttdb.h" +#include <stdint.h> +#include <stdbool.h> + +enum { + /* obsoleted */ + CDBVIOAPPEND, + /* append only format storage */ + CDBVIOAPND2, +}; + +typedef struct CDBVIO CDBVIO; + +/* write a record, returns virtual offset at 3rd parameter */ +typedef int (*VIOWRITEREC)(CDBVIO*, CDBREC*, FOFF*); +/* delete a record, pass in the current offset at 3rd parameter */ +typedef int (*VIODELETEREC)(CDBVIO*, CDBREC*, FOFF); +/* read a record, 2nd parameter default points to stack buffer, if its real size +greater than the stack buffer size, it will be changed to points to a space in heap, +the last parameter decides whether read the whole record or just read key for comparsion */ +typedef int (*VIOREADREC)(CDBVIO*, CDBREC**, FOFF, bool); +/* close the storage */ +typedef int (*VIOCLOSE)(CDBVIO*); +/* open the storage, pass in the storage path and open mode */ 
/*
 * Storage backend interface: a table of function pointers plus two context
 * pointers. cdb_vio_new() allocates one and passes it to vio_apnd2_init(),
 * which presumably fills in the pointers -- TODO confirm in vio_apnd2.c.
 * The contract of each slot is described next to its VIO* typedef.
 */
struct CDBVIO
{
    /* open the storage (path, open mode) / close it */
    VIOOPEN open;
    VIOCLOSE close;

    /* record I/O: write, delete, read */
    VIOWRITEREC wrec;
    VIODELETEREC drec;
    VIOREADREC rrec;

    /* index page I/O: write, read */
    VIOWRITEPAGE wpage;
    VIOREADPAGE rpage;

    /* sync the storage; write / read the db header holding the main index */
    VIOSYNC sync;
    VIOWRITEHEAD whead;
    VIOREADHEAD rhead;

    /* tell the backend that no dirty page exists */
    VIOCLEANPOINT cleanpoint;

    /* index page iterator: create at an oid, advance, destroy */
    VIOITFIRST pageitfirst;
    VIOPAGEITNEXT pageitnext;
    VIOITDESTROY pageitdestroy;

    /* record iterator: create at an oid, advance, destroy */
    VIOITFIRST recitfirst;
    VIORECITNEXT recitnext;
    VIOITDESTROY recitdestroy;

    /* database this backend belongs to */
    CDB *db;
    /* backend-private state; its layout is not visible from this header */
    void *iometa;
};
http://code.google.com/p/cuttdb/ + * + * The server&network part of CuttDB is based on Beansdb: + * + * http://beansdb.googlecode.com + * + * Beansdb is most based on Memcachedb and Memcached: + * + * http://memcachedb.org/ + * http://danga.com/memcached/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. + * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#include "cuttdb-server.h" +#include "cuttdb.h" +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <signal.h> +#include <sys/resource.h> +#include <sys/uio.h> +#include <unistd.h> + +/* need this to get IOV_MAX on some platforms. */ +#ifndef __need_IOV_MAX +#define __need_IOV_MAX +#endif +#include <pwd.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <time.h> +#include <assert.h> +#include <limits.h> +#include <inttypes.h> +#include <ctype.h> + + +#ifdef HAVE_READPROC +#include <proc/readproc.h> +#endif + +#ifdef HAVE_MALLOC_H +/* OpenBSD has a malloc.h, but warns to use stdlib.h instead */ +#ifndef __OpenBSD__ +#include <malloc.h> +#endif +#endif + +/* FreeBSD 4.x doesn't have IOV_MAX exposed. 
/*
 * Item subsystem initialisation hook.
 *
 * Intentionally a no-op: items are allocated and released individually,
 * so there is nothing to set up here.
 */
void item_init(void) {
}
uint8_t *nsuffix) { + /* suffix is defined at 40 chars elsewhere.. */ + *nsuffix = (uint8_t) snprintf(suffix, 40, " %d %d\r\n", flags, nbytes - 2); + return sizeof(item) + nkey + *nsuffix + nbytes; +} + +static int item_free(item *it) +{ + free(it); + return 0; +} + +static item *item_get(char *key, size_t nkey) +{ + item *it = NULL; + int vlen; + uint32_t flag; + void *value; + int ret = cdb_get(db, key, nkey, &value, &vlen); + flag = 0; + if (ret == 0){ + it = item_alloc1(key, nkey, flag, vlen + 2); + if (it){ + memcpy(ITEM_data(it), value, vlen); + memcpy(ITEM_data(it) + vlen, "\r\n", 2); + } + cdb_free_val(&value); + } + return it; + +} + +static item *item_alloc1(char *key, const size_t nkey, const int flags, const int nbytes) +{ + uint8_t nsuffix; + item *it; + char suffix[40]; + size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix); + + it = (item *)malloc(ntotal); + if (it == NULL){ + return NULL; + } + memset(it, 0, ntotal); + + it->nkey = nkey; + it->nbytes = nbytes; + strcpy(ITEM_key(it), key); + memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix); + it->nsuffix = nsuffix; + return it; +} + + +static void stats_init(void) { + stats.curr_conns = stats.total_conns = stats.conn_structs = 0; + stats.get_cmds = stats.set_cmds = stats.delete_cmds = 0; + stats.slow_cmds = stats.get_hits = stats.get_misses = 0; + stats.bytes_read = stats.bytes_written = 0; + + /* make the time we started always be 2 seconds before we really + did, so time(0) - time.started is never zero. if so, things + like 'settings.oldest_live' which act as booleans as well as + values are now false in boolean context... 
*/ + stats.started = time(0) - 2; +} + +static void stats_reset(void) { + STATS_LOCK(); + stats.total_conns = 0; + stats.get_cmds = stats.set_cmds = stats.delete_cmds = 0; + stats.slow_cmds = stats.get_hits = stats.get_misses = 0; + stats.bytes_read = stats.bytes_written = 0; + STATS_UNLOCK(); +} + +static void settings_init(void) { + settings.port = 8964; + /* By default this string should be NULL for getaddrinfo() */ + settings.inter = NULL; + settings.item_buf_size = 4 * 1024; /* default is 4KB */ + settings.maxconns = 1024; /* to limit connections-related memory to about 5MB */ + settings.verbose = 0; + settings.num_threads = 16; + settings.flush_period = 1; // 1 secs + settings.slow_cmd_time = 0.1; // 100ms +} + +/* + * Adds a message header to a connection. + * + * Returns 0 on success, -1 on out-of-memory. + */ +static int add_msghdr(conn *c) +{ + struct msghdr *msg; + + assert(c != NULL); + + if (c->msgsize == c->msgused) { + msg = realloc(c->msglist, c->msgsize * 2 * sizeof(struct msghdr)); + if (! msg) + return -1; + c->msglist = msg; + c->msgsize *= 2; + } + + msg = c->msglist + c->msgused; + + /* this wipes msg_iovlen, msg_control, msg_controllen, and + msg_flags, the last 3 of which aren't defined on solaris: */ + memset(msg, 0, sizeof(struct msghdr)); + + msg->msg_iov = &c->iov[c->iovused]; + + c->msgbytes = 0; + c->msgused++; + + return 0; +} + + +/* + * Free list management for connections. + */ + +static conn **freeconns; +static int freetotal; +static int freecurr; + + +static void conn_init(void) { + freetotal = 200; + freecurr = 0; + if ((freeconns = (conn **)malloc(sizeof(conn *) * freetotal)) == NULL) { + fprintf(stderr, "malloc()\n"); + } + return; +} + +/* + * Returns a connection from the freelist, if any. Should call this using + * conn_from_freelist() for thread safety. 
+ */ +conn *do_conn_from_freelist() { + conn *c; + + if (freecurr > 0) { + c = freeconns[--freecurr]; + } else { + c = NULL; + } + + return c; +} + +/* + * Adds a connection to the freelist. 0 = success. Should call this using + * conn_add_to_freelist() for thread safety. + */ +bool do_conn_add_to_freelist(conn *c) { + if (freecurr < freetotal) { + freeconns[freecurr++] = c; + return false; + } else { + /* try to enlarge free connections array */ + conn **new_freeconns = realloc(freeconns, sizeof(conn *) * freetotal * 2); + if (new_freeconns) { + freetotal *= 2; + freeconns = new_freeconns; + freeconns[freecurr++] = c; + return false; + } + } + return true; +} + +conn *conn_new(const int sfd, const int init_state, const int read_buffer_size) { + conn *c = conn_from_freelist(); + + if (NULL == c) { + if (!(c = (conn *)calloc(1, sizeof(conn)))) { + fprintf(stderr, "calloc()\n"); + return NULL; + } + c->rbuf = c->wbuf = 0; + c->ilist = 0; + c->iov = 0; + c->msglist = 0; + + c->rsize = read_buffer_size; + c->wsize = DATA_BUFFER_SIZE; + c->isize = ITEM_LIST_INITIAL; + c->iovsize = IOV_LIST_INITIAL; + c->msgsize = MSG_LIST_INITIAL; + + c->rbuf = (char *)malloc((size_t)c->rsize); + c->wbuf = (char *)malloc((size_t)c->wsize); + c->ilist = (item **)malloc(sizeof(item *) * c->isize); + c->iov = (struct iovec *)malloc(sizeof(struct iovec) * c->iovsize); + c->msglist = (struct msghdr *)malloc(sizeof(struct msghdr) * c->msgsize); + + if (c->rbuf == 0 || c->wbuf == 0 || c->ilist == 0 || c->iov == 0 || + c->msglist == 0) { + conn_free(c); + fprintf(stderr, "malloc()\n"); + return NULL; + } + + STATS_LOCK(); + stats.conn_structs++; + STATS_UNLOCK(); + } + + if (settings.verbose > 1) { + if (init_state == conn_listening) + fprintf(stderr, "<%d server listening\n", sfd); + else + fprintf(stderr, "<%d new client connection\n", sfd); + } + + c->sfd = sfd; + c->state = init_state; + c->rlbytes = 0; + c->rbytes = c->wbytes = 0; + c->wcurr = c->wbuf; + c->rcurr = c->rbuf; + c->ritem = 0; 
+ c->icurr = c->ilist; + c->ileft = 0; + c->iovused = 0; + c->msgcurr = 0; + c->msgused = 0; + + c->write_and_go = conn_read; + c->write_and_free = 0; + c->item = 0; + c->noreply = false; + + update_event(c, AE_READABLE); + if (add_event(sfd, AE_READABLE, c) == -1) { + if (conn_add_to_freelist(c)) { + conn_free(c); + } + perror("event_add"); + return NULL; + } + + STATS_LOCK(); + stats.curr_conns++; + stats.total_conns++; + STATS_UNLOCK(); + + return c; +} + +static void conn_cleanup(conn *c) { + assert(c != NULL); + + if (c->item) { + item_free(c->item); + c->item = 0; + } + + if (c->ileft != 0) { + for (; c->ileft > 0; c->ileft--,c->icurr++) { + item_free(*(c->icurr)); + } + } + + if (c->write_and_free) { + free(c->write_and_free); + c->write_and_free = 0; + } +} + +/* + * Frees a connection. + */ +void conn_free(conn *c) { + if (c) { + if (c->msglist) + free(c->msglist); + if (c->rbuf) + free(c->rbuf); + if (c->wbuf) + free(c->wbuf); + if (c->ilist) + free(c->ilist); + if (c->iov) + free(c->iov); + free(c); + } +} + +static void conn_close(conn *c) { + assert(c != NULL); + + if (settings.verbose > 1) + fprintf(stderr, "<%d connection closed.\n", c->sfd); + + delete_event(c->sfd); + close(c->sfd); + c->sfd = -1; + update_event(c, 0); + conn_cleanup(c); + + /* if the connection has big buffers, just free it */ + if (c->rsize > READ_BUFFER_HIGHWAT || conn_add_to_freelist(c)) { + conn_free(c); + } + + STATS_LOCK(); + stats.curr_conns--; + STATS_UNLOCK(); + + return; +} + + +/* + * Shrinks a connection's buffers if they're too big. This prevents + * periodic large "get" requests from permanently chewing lots of server + * memory. + * + * This should only be called in between requests since it can wipe output + * buffers! 
+ */ +static void conn_shrink(conn *c) { + assert(c != NULL); + + if (c->rsize > READ_BUFFER_HIGHWAT && c->rbytes < DATA_BUFFER_SIZE) { + char *newbuf; + + if (c->rcurr != c->rbuf) + memmove(c->rbuf, c->rcurr, (size_t)c->rbytes); + + newbuf = (char *)realloc((void *)c->rbuf, DATA_BUFFER_SIZE); + + if (newbuf) { + c->rbuf = newbuf; + c->rsize = DATA_BUFFER_SIZE; + } + /* TODO check other branch... */ + c->rcurr = c->rbuf; + } + + if (c->isize > ITEM_LIST_HIGHWAT) { + item **newbuf = (item**) realloc((void *)c->ilist, ITEM_LIST_INITIAL * sizeof(c->ilist[0])); + if (newbuf) { + c->ilist = newbuf; + c->isize = ITEM_LIST_INITIAL; + } + /* TODO check error condition? */ + } + + if (c->msgsize > MSG_LIST_HIGHWAT) { + struct msghdr *newbuf = (struct msghdr *) realloc((void *)c->msglist, MSG_LIST_INITIAL * sizeof(c->msglist[0])); + if (newbuf) { + c->msglist = newbuf; + c->msgsize = MSG_LIST_INITIAL; + } + /* TODO check error condition? */ + } + + if (c->iovsize > IOV_LIST_HIGHWAT) { + struct iovec *newbuf = (struct iovec *) realloc((void *)c->iov, IOV_LIST_INITIAL * sizeof(c->iov[0])); + if (newbuf) { + c->iov = newbuf; + c->iovsize = IOV_LIST_INITIAL; + } + /* TODO check return value */ + } +} + +/* + * Sets a connection's current state in the state machine. Any special + * processing that needs to happen on certain state transitions can + * happen here. + */ +static void conn_set_state(conn *c, int state) { + assert(c != NULL); + + if (state != c->state) { + if (state == conn_read) { + conn_shrink(c); + } + c->state = state; + } +} + + +/* + * Ensures that there is room for another struct iovec in a connection's + * iov list. + * + * Returns 0 on success, -1 on out-of-memory. + */ +static int ensure_iov_space(conn *c) { + assert(c != NULL); + + if (c->iovused >= c->iovsize) { + int i, iovnum; + struct iovec *new_iov = (struct iovec *)realloc(c->iov, + (c->iovsize * 2) * sizeof(struct iovec)); + if (! 
new_iov) + return -1; + c->iov = new_iov; + c->iovsize *= 2; + + /* Point all the msghdr structures at the new list. */ + for (i = 0, iovnum = 0; i < c->msgused; i++) { + c->msglist[i].msg_iov = &c->iov[iovnum]; + iovnum += c->msglist[i].msg_iovlen; + } + } + + return 0; +} + + +/* + * Adds data to the list of pending data that will be written out to a + * connection. + * + * Returns 0 on success, -1 on out-of-memory. + */ + +static int add_iov(conn *c, const void *buf, int len) { + struct msghdr *m; + int leftover; + bool limit_to_mtu; + + assert(c != NULL); + + do { + m = &c->msglist[c->msgused - 1]; + + /* + * Limit the first payloads of TCP replies, to + * MAX_PAYLOAD_SIZE bytes. + */ + limit_to_mtu = (1 == c->msgused); + + /* We may need to start a new msghdr if this one is full. */ + if (m->msg_iovlen == IOV_MAX || + (limit_to_mtu && c->msgbytes >= MAX_PAYLOAD_SIZE)) { + add_msghdr(c); + m = &c->msglist[c->msgused - 1]; + } + + if (ensure_iov_space(c) != 0) + return -1; + + /* If the fragment is too big to fit in the datagram, split it up */ + if (limit_to_mtu && len + c->msgbytes > MAX_PAYLOAD_SIZE) { + leftover = len + c->msgbytes - MAX_PAYLOAD_SIZE; + len -= leftover; + } else { + leftover = 0; + } + + m = &c->msglist[c->msgused - 1]; + m->msg_iov[m->msg_iovlen].iov_base = (void *)buf; + m->msg_iov[m->msg_iovlen].iov_len = len; + + c->msgbytes += len; + c->iovused++; + m->msg_iovlen++; + + buf = ((char *)buf) + len; + len = leftover; + } while (leftover > 0); + + return 0; +} + + +static void out_string(conn *c, const char *str) { + size_t len; + + assert(c != NULL); + + if (c->noreply) { + if (settings.verbose > 1) + fprintf(stderr, ">%d %s\n", c->sfd, str); + c->noreply = false; + conn_set_state(c, conn_read); + return; + } + + len = strlen(str); + if ((len + 2) > c->wsize) { + /* ought to be always enough. 
just fail for simplicity */ + str = "SERVER_ERROR output line too long"; + len = strlen(str); + } + + memcpy(c->wbuf, str, len); + memcpy(c->wbuf + len, "\r\n", 2); + c->wbytes = len + 2; + c->wcurr = c->wbuf; + + conn_set_state(c, conn_write); + c->write_and_go = conn_read; + return; +} + +/* + * we get here after reading the value in set/add/replace commands. The command + * has been stored in c->item_comm, and the item is ready in c->item. + */ + +static void complete_nread(conn *c) { + assert(c != NULL); + + item *it = c->item; + int comm = c->item_comm; + int ret; + + STATS_LOCK(); + stats.set_cmds++; + STATS_UNLOCK(); + + if (strncmp(ITEM_data(it) + it->nbytes - 2, "\r\n", 2) != 0) { + out_string(c, "CLIENT_ERROR bad data chunk"); + } else { + ret = store_item(it, comm); + if (ret == 0) + out_string(c, "STORED"); + else if(ret == -2) + out_string(c, "EXISTS"); + else if(ret == -3) + out_string(c, "NOT_FOUND"); + else + out_string(c, "NOT_STORED"); + } + + item_free(c->item); + c->item = 0; +} + +/* + * Stores an item in the cache according to the semantics of one of the set + * commands. In threaded mode, this is protected by the cache lock. + * + * Returns true if the item was stored. + */ +int store_item(item *it, int comm) { + char *key = ITEM_key(it); + + switch (comm) { + case NREAD_SET: + return cdb_set2(db, key, it->nkey, ITEM_data(it), it->nbytes - 2, CDB_INSERTCACHE | CDB_OVERWRITE, it->expire); + case NREAD_ADD: + return cdb_set2(db, key, it->nkey, ITEM_data(it), it->nbytes - 2, CDB_INSERTCACHE | CDB_INSERTIFNOEXIST, it->expire); + case NREAD_REPLACE: + return cdb_set2(db, key, it->nkey, ITEM_data(it), it->nbytes - 2, CDB_INSERTCACHE | CDB_INSERTIFEXIST, it->expire); + } + return 0; +} + +/* + * adds a delta value to a numeric item. 
/*
 * One entry of the token array filled in by tokenize_command().
 */
typedef struct token_s {
    /* start of the token inside the (modified in place) command buffer;
       in the terminal entry it is NULL when the whole string was scanned,
       otherwise it points at the first unprocessed character */
    char *value;
    /* token length in bytes, without the terminating '\0';
       0 marks the terminal entry */
    size_t length;
} token_t;
NULL : e; + tokens[ntokens].length = 0; + ntokens++; + + return ntokens; +} + +static inline bool set_noreply_maybe(conn *c, token_t *tokens, size_t ntokens) +{ + int noreply_index = ntokens - 2; + + /* + NOTE: this function is not the first place where we are going to + send the reply. We could send it instead from process_command() + if the request line has wrong number of tokens. However parsing + malformed line for "noreply" option is not reliable anyway, so + it can't be helped. + */ + if (tokens[noreply_index].value + && strcmp(tokens[noreply_index].value, "noreply") == 0) { + c->noreply = true; + } + return c->noreply; +} + +static void process_stat(conn *c, token_t *tokens, const size_t ntokens) { + time_t now = time(0); + char *command; + char *subcommand; + + assert(c != NULL); + + if(ntokens < 2) { + out_string(c, "CLIENT_ERROR bad command line"); + return; + } + + command = tokens[COMMAND_TOKEN].value; + + if (ntokens == 2 && strcmp(command, "stats") == 0) { + char temp[1024]; + pid_t pid = getpid(); + uint64_t total = 0, curr = 0; + CDBSTAT db_stat; + cdb_stat(db, &db_stat); + total = db_stat.rnum; + char *pos = temp; + +#ifndef WIN32 + struct rusage usage; + getrusage(RUSAGE_SELF, &usage); +#endif /* !WIN32 */ + + STATS_LOCK(); + pos += sprintf(pos, "STAT pid %ld\r\n", (long)pid); + pos += sprintf(pos, "STAT uptime %"PRIuS"\r\n", now - stats.started); + pos += sprintf(pos, "STAT time %"PRIuS"\r\n", now); + pos += sprintf(pos, "STAT version " VERSION "\r\n"); + pos += sprintf(pos, "STAT pointer_size %"PRIuS"\r\n", 8 * sizeof(void *)); +#ifndef WIN32 + pos += sprintf(pos, "STAT rusage_user %ld.%06ld\r\n", usage.ru_utime.tv_sec, usage.ru_utime.tv_usec); + pos += sprintf(pos, "STAT rusage_system %ld.%06ld\r\n", usage.ru_stime.tv_sec, usage.ru_stime.tv_usec); + pos += sprintf(pos, "STAT rusage_minflt %"PRIu64"\r\n", usage.ru_minflt); + pos += sprintf(pos, "STAT rusage_majflt %"PRIu64"\r\n", usage.ru_majflt); + pos += sprintf(pos, "STAT rusage_nswap 
%"PRIu64"\r\n", usage.ru_nswap); + pos += sprintf(pos, "STAT rusage_inblock %"PRIu64"\r\n", usage.ru_inblock); + pos += sprintf(pos, "STAT rusage_oublock %"PRIu64"\r\n", usage.ru_oublock); + pos += sprintf(pos, "STAT rusage_nvcsw %"PRIu64"\r\n", usage.ru_nvcsw); + pos += sprintf(pos, "STAT rusage_nivcsw %"PRIu64"\r\n", usage.ru_nivcsw); +#endif /* !WIN32 */ +#ifdef HAVE_READPROC + proc_t p; + get_proc_stats(getpid(), &p); + pos += sprintf(pos, "STAT rusage_maxrss %"PRIu64"\r\n", p.vm_rss); +#endif + pos += sprintf(pos, "STAT item_buf_size %"PRIuS"\r\n", settings.item_buf_size); + pos += sprintf(pos, "STAT curr_connections %"PRIu32"\r\n", stats.curr_conns - 1); /* ignore listening conn */ + pos += sprintf(pos, "STAT total_connections %"PRIu32"\r\n", stats.total_conns); + pos += sprintf(pos, "STAT connection_structures %"PRIu32"\r\n", stats.conn_structs); + pos += sprintf(pos, "STAT cmd_get %"PRIu64"\r\n", stats.get_cmds); + pos += sprintf(pos, "STAT cmd_set %"PRIu64"\r\n", stats.set_cmds); + pos += sprintf(pos, "STAT cmd_delete %"PRIu64"\r\n", stats.delete_cmds); + pos += sprintf(pos, "STAT slow_cmd %"PRIu64"\r\n", stats.slow_cmds); + pos += sprintf(pos, "STAT get_hits %"PRIu64"\r\n", stats.get_hits); + pos += sprintf(pos, "STAT get_misses %"PRIu64"\r\n", stats.get_misses); + pos += sprintf(pos, "STAT curr_items %"PRIu64"\r\n", curr); + pos += sprintf(pos, "STAT total_items %"PRIu64"\r\n", total); + pos += sprintf(pos, "STAT bytes_read %"PRIu64"\r\n", stats.bytes_read); + pos += sprintf(pos, "STAT bytes_written %"PRIu64"\r\n", stats.bytes_written); + pos += sprintf(pos, "STAT threads %d\r\n", settings.num_threads); + pos += sprintf(pos, "STAT records_in_cache %lu\r\n", db_stat.rcnum); + pos += sprintf(pos, "STAT pages_total %lu\r\n", db_stat.pnum); + pos += sprintf(pos, "STAT pages_in_cache %lu\r\n", db_stat.pcnum); + pos += sprintf(pos, "STAT record_cache_hits %lu\r\n", db_stat.rchit); + pos += sprintf(pos, "STAT record_cache_misses %lu\r\n", db_stat.rcmiss); + pos 
+= sprintf(pos, "STAT page_cache_hits %lu\r\n", db_stat.pchit); + pos += sprintf(pos, "STAT page_cache_misses %lu\r\n", db_stat.pcmiss); + pos += sprintf(pos, "STAT read_latency_avg %u\r\n", db_stat.rlatcy); + pos += sprintf(pos, "STAT write_latency_avg %u\r\n", db_stat.wlatcy); + pos += sprintf(pos, "END"); + STATS_UNLOCK(); + out_string(c, temp); + return; + } + + subcommand = tokens[SUBCOMMAND_TOKEN].value; + + if (strcmp(subcommand, "reset") == 0) { + stats_reset(); + out_string(c, "RESET"); + return; + } + + out_string(c, "ERROR"); +} + +/* ntokens is overwritten here... shrug.. */ +static inline void process_get_command(conn *c, token_t *tokens, size_t ntokens) { + char *key; + size_t nkey; + int i = 0; + item *it = NULL; + token_t *key_token = &tokens[KEY_TOKEN]; + int stats_get_cmds = 0; + int stats_get_hits = 0; + int stats_get_misses = 0; + assert(c != NULL); + + do { + while(key_token->length != 0) { + + key = key_token->value; + nkey = key_token->length; + + if(nkey > KEY_MAX_LENGTH) { + STATS_LOCK(); + stats.get_cmds += stats_get_cmds; + stats.get_hits += stats_get_hits; + stats.get_misses += stats_get_misses; + STATS_UNLOCK(); + out_string(c, "CLIENT_ERROR bad command line format"); + return; + } + + stats_get_cmds++; + + it = item_get(key, nkey); + + if (it) { + if (i >= c->isize) { + item **new_list = realloc(c->ilist, sizeof(item *) * c->isize * 2); + if (new_list) { + c->isize *= 2; + c->ilist = new_list; + } else { + item_free(it); + it = NULL; + break; + } + } + + /* + * Construct the response. 
Each hit adds three elements to the + * outgoing data list: + * "VALUE " + * key + * " " + flags + " " + data length + "\r\n" + data (with \r\n) + */ + + if (add_iov(c, "VALUE ", 6) != 0 || + add_iov(c, ITEM_key(it), it->nkey) != 0 || + add_iov(c, ITEM_suffix(it), it->nsuffix + it->nbytes) != 0) + { + item_free(it); + it = NULL; + break; + } + + if (settings.verbose > 1) + fprintf(stderr, ">%d sending key %s\n", c->sfd, ITEM_key(it)); + + stats_get_hits++; + *(c->ilist + i) = it; + i++; + + } else { + stats_get_misses++; + } + + key_token++; + } + + /* + * If the command string hasn't been fully processed, get the next set + * of tokens. + */ + if(key_token->value != NULL) { + ntokens = tokenize_command(key_token->value, tokens, MAX_TOKENS); + key_token = tokens; + } + + } while(key_token->value != NULL); + + c->icurr = c->ilist; + c->ileft = i; + + if (settings.verbose > 1) + fprintf(stderr, ">%d END\n", c->sfd); + + /* + If the loop was terminated because of out-of-memory, it is not + reliable to add END\r\n to the buffer, because it might not end + in \r\n. So we send SERVER_ERROR instead. 
+ */ + if (key_token->value != NULL || add_iov(c, "END\r\n", 5) != 0) { + out_string(c, "SERVER_ERROR out of memory writing get response"); + } + else { + conn_set_state(c, conn_mwrite); + c->msgcurr = 0; + } + + STATS_LOCK(); + stats.get_cmds += stats_get_cmds; + stats.get_hits += stats_get_hits; + stats.get_misses += stats_get_misses; + STATS_UNLOCK(); + + return; +} + +static void process_update_command(conn *c, token_t *tokens, const size_t ntokens, int comm) { + char *key; + size_t nkey; + int flags; + time_t exptime; + int vlen; + item *it = NULL; + + assert(c != NULL); + + set_noreply_maybe(c, tokens, ntokens); + + if (tokens[KEY_TOKEN].length > KEY_MAX_LENGTH) { + out_string(c, "CLIENT_ERROR bad command line format"); + return; + } + + key = tokens[KEY_TOKEN].value; + nkey = tokens[KEY_TOKEN].length; + + flags = strtoul(tokens[2].value, NULL, 10); + exptime = strtol(tokens[3].value, NULL, 10); + vlen = strtol(tokens[4].value, NULL, 10); + + if(errno == ERANGE || ((flags == 0 || exptime == 0) && errno == EINVAL) + || vlen < 0) { + out_string(c, "CLIENT_ERROR bad command line format"); + return; + } + + it = item_alloc1(key, nkey, flags, vlen+2); + it->expire = exptime; + it->flag = flags; + + if (it == NULL) { + out_string(c, "SERVER_ERROR out of memory storing object"); + /* swallow the data line */ + c->write_and_go = conn_swallow; + c->sbytes = vlen + 2; + return; + } + + c->item = it; + c->ritem = ITEM_data(it); + c->rlbytes = it->nbytes; + c->item_comm = comm; + conn_set_state(c, conn_nread); +} + +bool safe_strtoull(const char *str, uint64_t *out) { + assert(out != NULL); + errno = 0; + *out = 0; + char *endptr; + unsigned long long ull = strtoull(str, &endptr, 10); + if (errno == ERANGE) + return false; + if (isspace(*endptr) || (*endptr == '\0' && endptr != str)) { + *out = ull; + return true; + } + return false; +} + +/* + + +static void process_arithmetic_command(conn *c, token_t *tokens, const size_t ntokens, const bool incr) { + char 
temp[INCR_MAX_STORAGE_LEN]; + uint64_t delta; + char *key; + size_t nkey; + + assert(c != NULL); + + set_noreply_maybe(c, tokens, ntokens); + + STATS_LOCK(); + stats.set_cmds++; + STATS_UNLOCK(); + + if (tokens[KEY_TOKEN].length > KEY_MAX_LENGTH) { + out_string(c, "CLIENT_ERROR bad command line format"); + return; + } + + key = tokens[KEY_TOKEN].value; + nkey = tokens[KEY_TOKEN].length; + + if (!safe_strtoull(tokens[2].value, &delta)) { + out_string(c, "CLIENT_ERROR invalid numeric delta argument"); + return; + } + + switch(add_delta(key, nkey, delta, temp)) { + case 0: + out_string(c, temp); + break; +// case NON_NUMERIC: +// out_string(c, "CLIENT_ERROR cannot increment or decrement non-numeric value"); +// break; +// case EOM: +// out_string(c, "SERVER_ERROR out of memory"); +// break; + } +} +*/ + + +/* + * Handle a "delete" command: count it in the stats, validate the key + * length, ask the database to remove the key and report the outcome. + * + * cdb_del() results handled here: 0 -> DELETED, -3 -> NOT_FOUND; every + * other value is reported as SERVER_ERROR. + */ +static void process_delete_command(conn *c, token_t *tokens, const size_t ntokens) { + char *key; + size_t nkey; + assert(c != NULL); + + set_noreply_maybe(c, tokens, ntokens); + + STATS_LOCK(); + stats.delete_cmds++; + STATS_UNLOCK(); + + key = tokens[KEY_TOKEN].value; + nkey = tokens[KEY_TOKEN].length; + if(nkey > KEY_MAX_LENGTH) { + out_string(c, "CLIENT_ERROR bad command line format"); + return; + } + + switch (cdb_del(db, key, nkey)) { + case 0: + out_string(c, "DELETED"); + break; + case -3: + out_string(c, "NOT_FOUND"); + break; + default: + /* always answer: without a reply the client would wait forever */ + out_string(c, "SERVER_ERROR failed to delete item"); + break; + } + return; +} + +static void process_verbosity_command(conn *c, token_t *tokens, const size_t ntokens) { + unsigned int level; + + assert(c != NULL); + + set_noreply_maybe(c, tokens, ntokens); + + level = strtoul(tokens[1].value, NULL, 10); + if(errno == ERANGE) { + out_string(c, "CLIENT_ERROR bad command line format"); + return; + } + settings.verbose = level > MAX_VERBOSITY_LEVEL ?
MAX_VERBOSITY_LEVEL : level; + out_string(c, "OK"); + return; +} + +static void process_command(conn *c, char *command) { + + token_t tokens[MAX_TOKENS]; + size_t ntokens; + int comm; + struct timespec start, end; + + assert(c != NULL); + + if (settings.verbose > 1) + fprintf(stderr, "<%d %s\n", c->sfd, command); + + /* + * for commands set/add/replace, we build an item and read the data + * directly into it, then continue in nread_complete(). + */ + + c->msgcurr = 0; + c->msgused = 0; + c->iovused = 0; + if (add_msghdr(c) != 0) { + out_string(c, "SERVER_ERROR out of memory preparing response"); + return; + } + + clock_gettime(CLOCK_MONOTONIC, &start); + + ntokens = tokenize_command(command, tokens, MAX_TOKENS); + if (ntokens >= 3 && + (strcmp(tokens[COMMAND_TOKEN].value, "get") == 0) ) { + + process_get_command(c, tokens, ntokens); + + } else if ((ntokens == 6 || ntokens == 7) && + ((strcmp(tokens[COMMAND_TOKEN].value, "set") == 0 && (comm = NREAD_SET)) || + (strcmp(tokens[COMMAND_TOKEN].value, "add") == 0 && (comm = NREAD_ADD)) || + (strcmp(tokens[COMMAND_TOKEN].value, "replace") == 0 && (comm = NREAD_REPLACE)))) { + + process_update_command(c, tokens, ntokens, comm); + +// } else if ((ntokens == 4 || ntokens == 5) && (strcmp(tokens[COMMAND_TOKEN].value, "incr") == 0)) { + +// process_arithmetic_command(c, tokens, ntokens, 1); + + } else if (ntokens >= 3 && ntokens <= 4 && (strcmp(tokens[COMMAND_TOKEN].value, "delete") == 0)) { + + process_delete_command(c, tokens, ntokens); + + } else if (ntokens >= 2 && (strcmp(tokens[COMMAND_TOKEN].value, "stats") == 0)) { + + process_stat(c, tokens, ntokens); + + } else if (ntokens == 2 && (strcmp(tokens[COMMAND_TOKEN].value, "version") == 0)) { + + out_string(c, "VERSION " VERSION); + + } else if (ntokens == 2 && (strcmp(tokens[COMMAND_TOKEN].value, "quit") == 0)) { + + conn_set_state(c, conn_closing); + + } else if (ntokens == 3 && (strcmp(tokens[COMMAND_TOKEN].value, "verbosity") == 0)) { + + process_verbosity_command(c, 
tokens, ntokens); + +/* } else if (ntokens >= 2 && ntokens <= 4 && (strcmp(tokens[COMMAND_TOKEN].value, "flush_all") == 0)) { + + set_noreply_maybe(c, tokens, ntokens); + + int limit = 10000; + if (ntokens == (c->noreply ? 4 : 3)) { + limit = strtol(tokens[1].value, NULL, 10); + if(errno == ERANGE) { + out_string(c, "CLIENT_ERROR bad command line format"); + return; + } + } + + hs_optimize(db, limit); + out_string(c, "OK"); + return; +*/ + } else { + out_string(c, "ERROR"); + return; + } + + clock_gettime(CLOCK_MONOTONIC, &end); + float secs = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; + if (secs > settings.slow_cmd_time) { + STATS_LOCK(); + stats.slow_cmds ++; + STATS_UNLOCK(); + } + + // access logging + if (NULL != access_log && ntokens >= 3) { + char now[255]; + time_t t = time(NULL); + strftime(now, 200, "%Y-%m-%d %H:%M:%S", localtime(&t)); + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(addr); + getpeername(c->sfd, (struct sockaddr*)&addr, &addrlen); + char host[NI_MAXHOST], serv[NI_MAXSERV]; + getnameinfo((struct sockaddr*)&addr, addrlen, host, sizeof(host), serv, sizeof(serv), + NI_NUMERICSERV); + fprintf(access_log, "%s %s:%s %s %s %.3f\n", now, host, serv, + command, tokens[1].value, secs*1000); + } + + return; +} + +/* + * if we have a complete line in the buffer, process it. + */ +static int try_read_command(conn *c) { + char *el, *cont; + + assert(c != NULL); + assert(c->rcurr <= (c->rbuf + c->rsize)); + + if (c->rbytes == 0) + return 0; + el = memchr(c->rcurr, '\n', c->rbytes); + if (!el) + return 0; + cont = el + 1; + if ((el - c->rcurr) > 1 && *(el - 1) == '\r') { + el--; + } + *el = '\0'; + + assert(cont <= (c->rcurr + c->rbytes)); + + process_command(c, c->rcurr); + + c->rbytes -= (cont - c->rcurr); + c->rcurr = cont; + + assert(c->rcurr <= (c->rbuf + c->rsize)); + + return 1; +} + +/* + * read from network as much as we can, handle buffer overflow and connection + * close. 
+ * before reading, move the remaining incomplete fragment of a command + * (if any) to the beginning of the buffer. + * return 0 if there's nothing to read on the first read. + */ +static int try_read_network(conn *c) { + int gotdata = 0; + int res; + + assert(c != NULL); + + if (c->rcurr != c->rbuf) { + if (c->rbytes != 0) /* otherwise there's nothing to copy */ + memmove(c->rbuf, c->rcurr, c->rbytes); + c->rcurr = c->rbuf; + } + + while (1) { + if (c->rbytes >= c->rsize) { + char *new_rbuf = realloc(c->rbuf, c->rsize * 2); + if (!new_rbuf) { + if (settings.verbose > 0) + fprintf(stderr, "Couldn't realloc input buffer\n"); + c->rbytes = 0; /* ignore what we read */ + out_string(c, "SERVER_ERROR out of memory reading request"); + c->write_and_go = conn_closing; + return 1; + } + c->rcurr = c->rbuf = new_rbuf; + c->rsize *= 2; + } + + + int avail = c->rsize - c->rbytes; + res = read(c->sfd, c->rbuf + c->rbytes, avail); + if (res > 0) { + STATS_LOCK(); + stats.bytes_read += res; + STATS_UNLOCK(); + gotdata = 1; + c->rbytes += res; + if (res == avail) { + continue; + } else { + break; + } + } + if (res == 0) { + /* connection closed */ + conn_set_state(c, conn_closing); + return 1; + } + if (res == -1) { + if (errno == EAGAIN || errno == EWOULDBLOCK) break; + /* Should close on unhandled errors. */ + conn_set_state(c, conn_closing); + return 1; + } + } + return gotdata; +} + +static bool update_event(conn *c, const int new_flags) { + c->ev_flags = new_flags; + return true; +} + +/* + * Transmit the next chunk of data from our list of msgbuf structures. + * + * Returns: + * TRANSMIT_COMPLETE All done writing. + * TRANSMIT_INCOMPLETE More data remaining to write. + * TRANSMIT_SOFT_ERROR Can't write any more right now. 
+ * TRANSMIT_HARD_ERROR Can't write (c->state is set to conn_closing) + */ +static int transmit(conn *c) { + assert(c != NULL); + + if (c->msgcurr < c->msgused && + c->msglist[c->msgcurr].msg_iovlen == 0) { + /* Finished writing the current msg; advance to the next. */ + c->msgcurr++; + } + if (c->msgcurr < c->msgused) { + ssize_t res; + struct msghdr *m = &c->msglist[c->msgcurr]; + + res = sendmsg(c->sfd, m, 0); + if (res > 0) { + STATS_LOCK(); + stats.bytes_written += res; + STATS_UNLOCK(); + + /* We've written some of the data. Remove the completed + iovec entries from the list of pending writes. */ + while (m->msg_iovlen > 0 && res >= m->msg_iov->iov_len) { + res -= m->msg_iov->iov_len; + m->msg_iovlen--; + m->msg_iov++; + } + + /* Might have written just part of the last iovec entry; + adjust it so the next write will do the rest. */ + if (res > 0) { + m->msg_iov->iov_base += res; + m->msg_iov->iov_len -= res; + } + return TRANSMIT_INCOMPLETE; + } + if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + update_event(c, AE_WRITABLE); + return TRANSMIT_SOFT_ERROR; + } + /* if res==0 or res==-1 and error is not EAGAIN or EWOULDBLOCK, + we have a real error, on which we close the connection */ + if (settings.verbose > 0) + perror("Failed to write, and not due to blocking"); + + conn_set_state(c, conn_closing); + return TRANSMIT_HARD_ERROR; + } else { + return TRANSMIT_COMPLETE; + } +} + +void drive_machine(conn *c) { + bool stop = false; + int sfd, flags = 1; + socklen_t addrlen; + struct sockaddr_storage addr; + int res; + + assert(c != NULL); + + while (!stop) { + + switch(c->state) { + case conn_listening: + addrlen = sizeof(addr); + if ((sfd = accept(c->sfd, (struct sockaddr *)&addr, &addrlen)) == -1) { + stop = true; + if (errno == EAGAIN || errno == EWOULDBLOCK) { + /* these are transient, so don't log anything */ + } else if (errno == EMFILE) { + if (settings.verbose > 0) + fprintf(stderr, "Too many open connections\n"); + if (stub_fd > 0){ + 
close(stub_fd); + if ((sfd = accept(c->sfd, (struct sockaddr *)&addr, &addrlen)) != -1) { + close(sfd); + stop = false; + }else{ + if (settings.verbose > 0) + fprintf(stderr, "Too many open connections 2\n"); + } + /* reopen the stub whether or not the retry worked, so stub_fd + * never keeps naming a descriptor that has been closed */ + stub_fd = open("/dev/null", O_RDONLY); + } + } else { + perror("accept()"); + } + /* accept() gave us no connection to keep (the EMFILE path closes + * the one it drains), so sfd must not reach the fcntl()/conn_new() + * code below; the loop re-enters this state only if stop was cleared */ + break; + } + if ((flags = fcntl(sfd, F_GETFL, 0)) < 0 || + fcntl(sfd, F_SETFL, flags | O_NONBLOCK) < 0) { + perror("setting O_NONBLOCK"); + close(sfd); + break; + } + if (NULL == conn_new(sfd, conn_read, DATA_BUFFER_SIZE)) { + if (settings.verbose > 0) { + fprintf(stderr, "Can't listen for events on fd %d\n", sfd); + } + close(sfd); + } + break; + + case conn_read: + if (try_read_command(c) != 0) { + continue; + } + if (try_read_network(c) != 0) { + continue; + } + /* we have no command line and no data to read from network */ + update_event(c, AE_READABLE); + stop = true; + break; + + case conn_nread: + /* we are reading rlbytes into ritem; */ + if (c->rlbytes == 0) { + complete_nread(c); + break; + } + /* first check if we have leftovers in the conn_read buffer */ + if (c->rbytes > 0) { + int tocopy = c->rbytes > c->rlbytes ?
c->rlbytes : c->rbytes; + memcpy(c->ritem, c->rcurr, tocopy); + c->ritem += tocopy; + c->rlbytes -= tocopy; + c->rcurr += tocopy; + c->rbytes -= tocopy; + break; + } + + /* now try reading from the socket */ + res = read(c->sfd, c->ritem, c->rlbytes); + if (res > 0) { + STATS_LOCK(); + stats.bytes_read += res; + STATS_UNLOCK(); + c->ritem += res; + c->rlbytes -= res; + break; + } + if (res == 0) { /* end of stream */ + conn_set_state(c, conn_closing); + break; + } + if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + update_event(c, AE_READABLE); + stop = true; + break; + } + /* otherwise we have a real error, on which we close the connection */ + if (settings.verbose > 0) + fprintf(stderr, "Failed to read, and not due to blocking\n"); + conn_set_state(c, conn_closing); + break; + + case conn_swallow: + /* we are reading sbytes and throwing them away */ + if (c->sbytes == 0) { + conn_set_state(c, conn_read); + break; + } + + /* first check if we have leftovers in the conn_read buffer */ + if (c->rbytes > 0) { + int tocopy = c->rbytes > c->sbytes ? c->sbytes : c->rbytes; + c->sbytes -= tocopy; + c->rcurr += tocopy; + c->rbytes -= tocopy; + break; + } + + /* now try reading from the socket */ + res = read(c->sfd, c->rbuf, c->rsize > c->sbytes ? c->sbytes : c->rsize); + if (res > 0) { + STATS_LOCK(); + stats.bytes_read += res; + STATS_UNLOCK(); + c->sbytes -= res; + break; + } + if (res == 0) { /* end of stream */ + conn_set_state(c, conn_closing); + break; + } + if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + update_event(c, AE_READABLE); + stop = true; + break; + } + /* otherwise we have a real error, on which we close the connection */ + if (settings.verbose > 0) + fprintf(stderr, "Failed to read, and not due to blocking\n"); + conn_set_state(c, conn_closing); + break; + + case conn_write: + /* + * We want to write out a simple response. 
If we haven't already, + * assemble it into a msgbuf list (this will be a single-entry + * list for TCP or a two-entry list for UDP). + */ + if (c->iovused == 0) { + if (add_iov(c, c->wcurr, c->wbytes) != 0) { + if (settings.verbose > 0) + fprintf(stderr, "Couldn't build response\n"); + conn_set_state(c, conn_closing); + break; + } + } + + /* fall through... */ + + case conn_mwrite: + switch (transmit(c)) { + case TRANSMIT_COMPLETE: + if (c->state == conn_mwrite) { + while (c->ileft > 0) { + item *it = *(c->icurr); + item_free(it); + c->icurr++; + c->ileft--; + } + conn_set_state(c, conn_read); + } else if (c->state == conn_write) { + if (c->write_and_free) { + free(c->write_and_free); + c->write_and_free = 0; + } + conn_set_state(c, c->write_and_go); + } else { + if (settings.verbose > 0) + fprintf(stderr, "Unexpected state %d\n", c->state); + conn_set_state(c, conn_closing); + } + break; + + case TRANSMIT_INCOMPLETE: + case TRANSMIT_HARD_ERROR: + break; /* Continue in state machine. */ + + case TRANSMIT_SOFT_ERROR: + stop = true; + break; + } + break; + + case conn_closing: + conn_close(c); + stop = true; + break; + } + } + + return; +} + +static int new_socket(struct addrinfo *ai) { + int sfd; + int flags; + + if ((sfd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) == -1) { + perror("socket()"); + return -1; + } + + if ((flags = fcntl(sfd, F_GETFL, 0)) < 0 || + fcntl(sfd, F_SETFL, flags | O_NONBLOCK) < 0) { + perror("setting O_NONBLOCK"); + close(sfd); + return -1; + } + return sfd; +} + +static int server_socket(const int port, const bool is_udp) { + int sfd; + struct linger ling = {0, 0}; + struct addrinfo *ai; + struct addrinfo *next; + struct addrinfo hints; + char port_buf[NI_MAXSERV]; + int error; + int success = 0; + + int flags =1; + + /* + * the memset call clears nonstandard fields in some impementations + * that otherwise mess things up. 
+ */ + memset(&hints, 0, sizeof (hints)); + hints.ai_flags = AI_PASSIVE|AI_ADDRCONFIG; + hints.ai_family = AF_UNSPEC; + hints.ai_protocol = IPPROTO_TCP; + hints.ai_socktype = SOCK_STREAM; + + snprintf(port_buf, NI_MAXSERV, "%d", port); + error= getaddrinfo(settings.inter, port_buf, &hints, &ai); + if (error != 0) { + if (error != EAI_SYSTEM) + fprintf(stderr, "getaddrinfo(): %s\n", gai_strerror(error)); + else + perror("getaddrinfo()"); + + return 1; + } + + for (next= ai; next; next= next->ai_next) { + conn *listen_conn_add; + if ((sfd = new_socket(next)) == -1) { + freeaddrinfo(ai); + return 1; + } + + setsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, (void *)&flags, sizeof(flags)); + setsockopt(sfd, SOL_SOCKET, SO_KEEPALIVE, (void *)&flags, sizeof(flags)); + setsockopt(sfd, SOL_SOCKET, SO_LINGER, (void *)&ling, sizeof(ling)); + setsockopt(sfd, IPPROTO_TCP, TCP_NODELAY, (void *)&flags, sizeof(flags)); + + if (bind(sfd, next->ai_addr, next->ai_addrlen) == -1) { + if (errno != EADDRINUSE) { + perror("bind()"); + close(sfd); + freeaddrinfo(ai); + return 1; + } + close(sfd); + continue; + } else { + success++; + if (listen(sfd, 1024) == -1) { + perror("listen()"); + close(sfd); + freeaddrinfo(ai); + return 1; + } + } + + if (!(listen_conn_add = conn_new(sfd, conn_listening, 1))) { + fprintf(stderr, "failed to create listening connection\n"); + exit(EXIT_FAILURE); + } + } + + freeaddrinfo(ai); + + /* Return zero iff we detected no errors in starting up connections */ + return success == 0; +} + +static void usage(void) { + printf(PACKAGE " " VERSION "\n"); + printf("-p <num> TCP port number to listen on (default: 8964)\n" + "-l <ip_addr> interface to listen on, default is INDRR_ANY\n" + "-d run as a daemon\n" + "-P <num> page cache limit(MB), default 256(MB)\n" + "-r <num> record cache limit(MB), default 256(MB)\n" + "-R <num> bytes for a disk read operation, must be between[1024,65535), recommend to be larger than most small records, default is 4096(Bytes)\n" + "-L <file> 
log file\n" + "-u <username> assume identity of <username> (only when run as root)\n" + "-c <num> max simultaneous connections, default is 1024\n" + "-t <num> number of threads to use, default 16\n" + "-H <dir> home of database, default is 'testdb', keep sure the directory exists\n" + "-s <num> slow command time limit, in ms, default is 100ms\n" + "-n main hash table size, recommend to be 1%% - 10%% of maximum record num, default is 1000000\n" + "-v verbose (print errors/warnings while in event loop)\n" + "-vv very verbose (also print client commands/reponses)\n" + "-h print this help and exit\n" + "-i print license info\n" + ); + + return; +} + +static void usage_license(void) { + printf(PACKAGE " " VERSION "\n\n"); + printf( + "Copyright (c) 2012, Siyuan Fu. <fusiyuan2010@gmail.com>\n" + "All rights reserved.\n" + "\n" + "\n" + "This product includes software developed by Douban Inc.\n" + "\n" + "[ Beansdb ]\n" + "\n" + "Copyright (c) 2009, Douban Inc. <http://www.douban.com/>\n" + "All rights reserved.\n" + "\n" + "Redistribution and use in source and binary forms, with or without\n" + "modification, are permitted provided that the following conditions are\n" + "met:\n" + "\n" + " * Redistributions of source code must retain the above copyright\n" + "notice, this list of conditions and the following disclaimer.\n" + "\n" + " * Redistributions in binary form must reproduce the above\n" + "copyright notice, this list of conditions and the following disclaimer\n" + "in the documentation and/or other materials provided with the\n" + "distribution.\n" + "\n" + " * Neither the name of the Douban Inc. 
nor the names of its\n" + "contributors may be used to endorse or promote products derived from\n" + "this software without specific prior written permission.\n" + "\n" + "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n" + "\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n" + "LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n" + "A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\n" + "OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n" + "SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n" + "LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n" + "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n" + "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n" + "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n" + "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n" + "\n" + "\n" + "This product includes software developed by Douban Inc.\n" + "\n" + "[ memcachedb ]\n" + "\n" + "Copyright (c) 2008, Steve Chu. 
<stvchu@gmail.com>\n" + "All rights reserved.\n" + "\n" + "Redistribution and use in source and binary forms, with or without\n" + "modification, are permitted provided that the following conditions are\n" + "met:\n" + "\n" + " * Redistributions of source code must retain the above copyright\n" + "notice, this list of conditions and the following disclaimer.\n" + "\n" + " * Redistributions in binary form must reproduce the above\n" + "copyright notice, this list of conditions and the following disclaimer\n" + "in the documentation and/or other materials provided with the\n" + "distribution.\n" + "\n" + " * Neither the name of the Danga Interactive nor the names of its\n" + "contributors may be used to endorse or promote products derived from\n" + "this software without specific prior written permission.\n" + "\n" + "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n" + "\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n" + "LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n" + "A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\n" + "OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n" + "SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n" + "LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n" + "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n" + "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n" + "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n" + "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n" + "\n" + "\n" + "This product includes software developed by Danga Interactive, Inc.\n" + "\n" + "[ memcached ]\n" + "\n" + "Copyright (c) 2003, Danga Interactive, Inc. 
<http://www.danga.com/>\n" + "All rights reserved.\n" + "\n" + "Redistribution and use in source and binary forms, with or without\n" + "modification, are permitted provided that the following conditions are\n" + "met:\n" + "\n" + " * Redistributions of source code must retain the above copyright\n" + "notice, this list of conditions and the following disclaimer.\n" + "\n" + " * Redistributions in binary form must reproduce the above\n" + "copyright notice, this list of conditions and the following disclaimer\n" + "in the documentation and/or other materials provided with the\n" + "distribution.\n" + "\n" + " * Neither the name of the Danga Interactive nor the names of its\n" + "contributors may be used to endorse or promote products derived from\n" + "this software without specific prior written permission.\n" + "\n" + "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n" + "\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n" + "LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n" + "A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n" + "OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n" + "SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n" + "LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n" + "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n" + "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n" + "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n" + "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n" + "\n" + "\n" + "This product includes software developed by Niels Provos.\n" + "\n" + "[ libevent ]\n" + "\n" + "Copyright 2000-2003 Niels Provos <provos@citi.umich.edu>\n" + "All rights reserved.\n" + "\n" + "Redistribution and use in source and binary forms, with or without\n" + "modification, are permitted provided that the following conditions\n" + "are met:\n" + "1. Redistributions of source code must retain the above copyright\n" + " notice, this list of conditions and the following disclaimer.\n" + "2. Redistributions in binary form must reproduce the above copyright\n" + " notice, this list of conditions and the following disclaimer in the\n" + " documentation and/or other materials provided with the distribution.\n" + "3. All advertising materials mentioning features or use of this software\n" + " must display the following acknowledgement:\n" + " This product includes software developed by Niels Provos.\n" + "4. 
The name of the author may not be used to endorse or promote products\n" + " derived from this software without specific prior written permission.\n" + "\n" + "THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR\n" + "IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES\n" + "OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\n" + "IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,\n" + "INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT\n" + "NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n" + "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n" + "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n" + "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF\n" + "THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n" + ); + + return; +} + + +/* for safely exit, make sure to do checkpoint*/ +static void sig_handler(const int sig) +{ + if (sig != SIGTERM && sig != SIGQUIT && sig != SIGINT) { + return; + } + if (daemon_quit == 1){ + return; + } + daemon_quit = 1; + fprintf(stderr, "Signal(%d) received, try to exit daemon gracefully..\n", sig); +} + + +int main (int argc, char **argv) { + int c; + //struct in_addr addr; + char *dbhome = "testdb"; + bool daemonize = false; + char *username = NULL; + FILE *log_file = NULL; + struct passwd *pw; + struct sigaction sa; + struct rlimit rlim; + int rcache = 256, pcache = 256; + /* recommend for 100,000,000 records*/ + int db_hsize = 1000000; + int areadsize = 4096; + + /* init settings */ + settings_init(); + + /* set stderr non-buffering (for running under, say, daemontools) */ + setbuf(stderr, NULL); + + /* process arguments */ + while ((c = getopt(argc, argv, "a:p:c:hivl:dr:u:P:L:t:b:H:s:n:R:")) != -1) { + switch (c) { + case 'a': + if (strcmp(optarg, "-") == 0) { + access_log = stdout; + }else{ + access_log = fopen(optarg, "a"); + 
if (NULL == access_log) { + fprintf(stderr, "open access_log %s failed\n", optarg); + exit(1); + } + } + break; + case 'p': + settings.port = atoi(optarg); + break; + case 'c': + settings.maxconns = atoi(optarg); + break; + case 'h': + usage(); + exit(EXIT_SUCCESS); + case 'i': + usage_license(); + exit(EXIT_SUCCESS); + case 'v': + settings.verbose++; + break; + case 'l': + settings.inter= strdup(optarg); + break; + case 'd': + daemonize = true; + break; + case 'r': + rcache = atoi(optarg); + break; + case 'R': + areadsize = atoi(optarg); + break; + case 'u': + username = optarg; + break; + case 'P': + pcache = atoi(optarg); + break; + case 'L': + if ((log_file = fopen(optarg, "a")) != NULL){ + setlinebuf(log_file); + fclose(stdout); + fclose(stderr); + stdout = stderr = log_file; + }else{ + fprintf(stderr, "open log file %s failed\n", optarg); + } + break; + case 't': + settings.num_threads = atoi(optarg); + if (settings.num_threads == 0) { + fprintf(stderr, "Number of threads must be greater than 0\n"); + exit(EXIT_FAILURE); + } + break; + case 'b': + settings.item_buf_size = atoi(optarg); + if(settings.item_buf_size < 512){ + fprintf(stderr, "item buf size must be larger than 512 bytes\n"); + exit(EXIT_FAILURE); + } + if(settings.item_buf_size > 256 * 1024){ + fprintf(stderr, "Warning: item buffer size(-b) larger than 256KB may cause performance issue\n"); + } + break; + case 'H': + dbhome = optarg; + break; + case 's': + settings.slow_cmd_time = atoi(optarg) / 1000.0; + break; + case 'n': + db_hsize = atoi(optarg); + break; + default: + fprintf(stderr, "Illegal argument \"%c\"\n", c); + exit(EXIT_FAILURE); + } + } + + /* + * If needed, increase rlimits to allow as many connections + * as needed. 
+ */ + + if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) { + fprintf(stderr, "failed to getrlimit number of files\n"); + exit(EXIT_FAILURE); + } else { + int maxfiles = settings.maxconns; + if (rlim.rlim_cur < maxfiles) + rlim.rlim_cur = maxfiles + 3; + if (rlim.rlim_max < rlim.rlim_cur) + rlim.rlim_max = rlim.rlim_cur; + if (setrlimit(RLIMIT_NOFILE, &rlim) != 0) { + fprintf(stderr, "failed to set rlimit for open files. Try running as root or requesting smaller maxconns value.\n"); + exit(EXIT_FAILURE); + } + } + + /* daemonize if requested */ + /* if we want to ensure our ability to dump core, don't chdir to / */ + if (daemonize) { + int res; + res = daemon(1, settings.verbose || log_file); + if (res == -1) { + fprintf(stderr, "failed to daemon() in order to daemonize\n"); + return 1; + } + } + + /* lose root privileges if we have them */ + if (getuid() == 0 || geteuid() == 0) { + if (username == 0 || *username == '\0') { + fprintf(stderr, "can't run as root without the -u switch\n"); + return 1; + } + if ((pw = getpwnam(username)) == 0) { + fprintf(stderr, "can't find the user %s to switch to\n", username); + return 1; + } + if (setgid(pw->pw_gid) < 0 || setuid(pw->pw_uid) < 0) { + fprintf(stderr, "failed to assume identity of user %s\n", username); + return 1; + } + } + + /* initialize other stuff */ + item_init(); + stats_init(); + conn_init(); + + /* + * ignore SIGPIPE signals; we can use errno==EPIPE if we + * need that information + */ + sa.sa_handler = SIG_IGN; + sa.sa_flags = 0; + if (sigemptyset(&sa.sa_mask) == -1 || + sigaction(SIGPIPE, &sa, 0) == -1) { + perror("failed to ignore SIGPIPE; sigaction"); + exit(EXIT_FAILURE); + } + + /* open db */ + db = cdb_new(); + cdb_option(db, db_hsize, rcache, pcache); + cdb_option_areadsize(db, areadsize); + + if (cdb_open(db, dbhome, CDB_CREAT | CDB_PAGEWARMUP) < 0) { + fprintf(stderr, "failed to open db %s\n", dbhome); + exit(1); + } + + if ((stub_fd = open("/dev/null", O_RDONLY)) == -1) { + perror("open stub file 
failed"); + exit(1); + } + thread_init(settings.num_threads); + + /* create the listening socket, bind it, and init */ + if (server_socket(settings.port, false)) { + fprintf(stderr, "failed to listen\n"); + exit(EXIT_FAILURE); + } + + /* register signal callback */ + if (signal(SIGTERM, sig_handler) == SIG_ERR) + fprintf(stderr, "can not catch SIGTERM\n"); + if (signal(SIGQUIT, sig_handler) == SIG_ERR) + fprintf(stderr, "can not catch SIGQUIT\n"); + if (signal(SIGINT, sig_handler) == SIG_ERR) + fprintf(stderr, "can not catch SIGINT\n"); + + /* enter the event loop */ + printf("all ready.\n"); + loop_run(settings.num_threads); + + /* wait other thread to ends */ + fprintf(stderr, "waiting for close ... \n"); + cdb_destroy(db); + fprintf(stderr, "done.\n"); + + if (log_file) { + fclose(log_file); + } + + return 0; +} + diff --git a/libdap-cuttdb/src/cuttdb-server.h b/libdap-cuttdb/src/cuttdb-server.h new file mode 100644 index 0000000000000000000000000000000000000000..90cc9b6271683c058dfacd007fad43ffef239c55 --- /dev/null +++ b/libdap-cuttdb/src/cuttdb-server.h @@ -0,0 +1,270 @@ +/* + * Beansdb - A high available distributed key-value storage system: + * + * http://beansdb.googlecode.com + * + * The source code of Beansdb is most based on Memcachedb and Memcached: + * + * http://memcachedb.org/ + * http://danga.com/memcached/ + * + * Copyright 2009 Douban Inc. All rights reserved. + * + * Use and distribution licensed under the BSD license. See + * the LICENSE file for full text. + * + * Authors: + * Davies Liu <davies.liu@gmail.com> + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <netinet/in.h> +#include <netdb.h> + +#define DATA_BUFFER_SIZE 2048 +#define MAX_PAYLOAD_SIZE 1400 +#define MAX_SENDBUF_SIZE (256 * 1024 * 1024) +/* I'm told the max legnth of a 64-bit num converted to string is 20 bytes. 
+ * Plus a few for spaces, \r\n, \0 */ +#define SUFFIX_SIZE 24 +#define INCR_MAX_STORAGE_LEN 24 + +/** Initial size of list of items being returned by "get". */ +#define ITEM_LIST_INITIAL 200 + +/** Initial size of the sendmsg() scatter/gather array. */ +#define IOV_LIST_INITIAL 400 + +/** Initial number of sendmsg() argument structures to allocate. */ +#define MSG_LIST_INITIAL 10 + +/** High water marks for buffer shrinking */ +#define READ_BUFFER_HIGHWAT 8192 +#define ITEM_LIST_HIGHWAT 400 +#define IOV_LIST_HIGHWAT 600 +#define MSG_LIST_HIGHWAT 100 + +#define MAX_REP_PRIORITY 1000000 +#define MAX_REP_ACK_POLICY 6 +#define MAX_REP_NSITES 1000 + + +#define RGET_MAX_ITEMS 100 +#define PACKAGE "CuttDB" +#define VERSION "0.1.0" + +/* Get a consistent bool type */ +#include <stdbool.h> + +#if HAVE_STDINT_H +# include <stdint.h> +#else + typedef unsigned char uint8_t; +#endif + +/* unistd.h is here */ +#if HAVE_UNISTD_H +# include <unistd.h> +#endif + +/* 64-bit Portable printf */ +/* printf macros for size_t, in the style of inttypes.h */ +#ifdef _LP64 +#define __PRIS_PREFIX "z" +#else +#define __PRIS_PREFIX +#endif + +#define AE_SETSIZE (1024*60) /* Max number of fd supported */ + +#define AE_OK 0 +#define AE_ERR -1 + +#define AE_NONE 0 +#define AE_READABLE 1 +#define AE_WRITABLE 2 + +/* Use these macros after a % in a printf format string + to get correct 32/64 bit behavior, like this: + size_t size = records.size(); + printf("%"PRIuS"\n", size); */ + +#define PRIdS __PRIS_PREFIX "d" +#define PRIxS __PRIS_PREFIX "x" +#define PRIuS __PRIS_PREFIX "u" +#define PRIXS __PRIS_PREFIX "X" +#define PRIoS __PRIS_PREFIX "o" + +struct stats { + uint32_t curr_conns; + uint32_t total_conns; + uint32_t conn_structs; + uint64_t get_cmds; + uint64_t set_cmds; + uint64_t delete_cmds; + uint64_t slow_cmds; + uint64_t get_hits; + uint64_t get_misses; + time_t started; /* when the process was started */ + uint64_t bytes_read; + uint64_t bytes_written; +}; + +#define MAX_VERBOSITY_LEVEL 2 
+ +struct settings { + size_t item_buf_size; + int maxconns; + int port; + char *inter; + int verbose; + float slow_cmd_time; + int flush_period; + int flush_limit; + int num_threads; /* number of libevent threads to run */ +}; + +extern struct stats stats; +extern struct settings settings; + +typedef struct _stritem { + int expire; /* expire time */ + uint32_t flag; /* flag of item */ + int nbytes; /* size of data */ + uint8_t nsuffix; /* length of flags-and-length string */ + uint8_t nkey; /* key length, w/terminating null and padding */ + void * end[]; + /* then null-terminated key */ + /* then " flags length\r\n" (no terminating null) */ + /* then data with terminating \r\n (no terminating null; it's binary!) */ +} item; + +#define ITEM_key(item) ((char*)&((item)->end[0])) + +/* warning: don't use these macros with a function, as it evals its arg twice */ +#define ITEM_suffix(item) ((char*) &((item)->end[0]) + (item)->nkey + 1) +#define ITEM_data(item) ((char*) &((item)->end[0]) + (item)->nkey + 1 + (item)->nsuffix) +#define ITEM_ntotal(item) (sizeof(struct _stritem) + (item)->nkey + 1 + (item)->nsuffix + (item)->nbytes) + +enum conn_states { + conn_listening, /** the socket which listens for connections */ + conn_read, /** reading in a command line */ + conn_write, /** writing out a simple response */ + conn_nread, /** reading in a fixed number of bytes */ + conn_swallow, /** swallowing unnecessary bytes w/o storing */ + conn_closing, /** closing this connection */ + conn_mwrite, /** writing out many items sequentially */ +}; + +#define NREAD_ADD 1 +#define NREAD_SET 2 +#define NREAD_REPLACE 3 +#define NREAD_APPEND 4 +#define NREAD_PREPEND 5 + +typedef struct conn conn; +struct conn { + int sfd; + int state; + short ev_flags; + + char *rbuf; /** buffer to read commands into */ + char *rcurr; /** but if we parsed some already, this is where we stopped */ + int rsize; /** total allocated size of rbuf */ + int rbytes; /** how much data, starting from rcur, do we 
have unparsed */ + + char *wbuf; + char *wcurr; + int wsize; + int wbytes; + int write_and_go; /** which state to go into after finishing current write */ + void *write_and_free; /** free this memory after finishing writing */ + bool noreply; /* True if the reply should not be sent. */ + + char *ritem; /** when we read in an item's value, it goes here */ + int rlbytes; + + /* data for the nread state */ + + /** + * item is used to hold an item structure created after reading the command + * line of set/add/replace commands, but before we finished reading the actual + * data. The data is read into ITEM_data(item) to avoid extra copying. + */ + + void *item; /* for commands set/add/replace */ + int item_comm; /* which one is it: set/add/replace */ + + /* data for the swallow state */ + int sbytes; /* how many bytes to swallow */ + + /* data for the mwrite state */ + struct iovec *iov; + int iovsize; /* number of elements allocated in iov[] */ + int iovused; /* number of elements used in iov[] */ + + struct msghdr *msglist; + int msgsize; /* number of elements allocated in msglist[] */ + int msgused; /* number of elements used in msglist[] */ + int msgcurr; /* element in msglist[] being transmitted now */ + int msgbytes; /* number of bytes in current msg */ + + item **ilist; /* list of items to write out */ + int isize; + item **icurr; + int ileft; + + conn *next; /* Used for generating a list of conn structures */ +}; + +/* + * Functions + */ + +/* item management */ +/* +void item_init(void); +item *do_item_from_freelist(void); +int do_item_add_to_freelist(item *it); +item *item_alloc1(char *key, const size_t nkey, const int flags, const int nbytes); +int item_free(item *it); +item *item_get(char *key, size_t nkey); +*/ + +/* conn management */ +conn *do_conn_from_freelist(); +bool do_conn_add_to_freelist(conn *c); +conn *conn_new(const int sfd, const int init_state, const int read_buffer_size); + +int store_item(item *item, int comm); + +void thread_init(int 
nthreads); +int add_event(int fd, int mask, conn *c); +void loop_run(int nthreads); + +void drive_machine(conn *c); + +/* Lock wrappers for cache functions that are called from main loop. */ +conn *mt_conn_from_freelist(void); +bool mt_conn_add_to_freelist(conn *c); +item *mt_item_from_freelist(void); +int mt_item_add_to_freelist(item *it); +void mt_stats_lock(void); +void mt_stats_unlock(void); + +#define conn_from_freelist() mt_conn_from_freelist() +#define conn_add_to_freelist(x) mt_conn_add_to_freelist(x) +#define item_from_freelist() mt_item_from_freelist() +#define item_add_to_freelist(x) mt_item_add_to_freelist(x) +#define STATS_LOCK() mt_stats_lock() +#define STATS_UNLOCK() mt_stats_unlock() + +extern int daemon_quit; + diff --git a/libdap-cuttdb/src/cuttdb.c b/libdap-cuttdb/src/cuttdb.c new file mode 100644 index 0000000000000000000000000000000000000000..74e342623a5308fd4275868263eaac87c1c726e7 --- /dev/null +++ b/libdap-cuttdb/src/cuttdb.c @@ -0,0 +1,21 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. + * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#include "cuttdb.h" +#include "cdb_types.h" +#include "cdb_vio.h" + + +/* nothing here */ diff --git a/cuttdb.h b/libdap-cuttdb/src/cuttdb.h similarity index 100% rename from cuttdb.h rename to libdap-cuttdb/src/cuttdb.h diff --git a/libdap-cuttdb/src/mman.c b/libdap-cuttdb/src/mman.c new file mode 100644 index 0000000000000000000000000000000000000000..ea5d358adc7768acd227507d7ac9e7853823729d --- /dev/null +++ b/libdap-cuttdb/src/mman.c @@ -0,0 +1,172 @@ +/* + * mman-win32 library + * https://code.google.com/p/mman-win32/ + * reinterpreted by Konstantin Papizh <konstantin.papizh@demlabs.net> + * DeM Labs Inc. 
https://demlabs.net + */ + +#include <windows.h> +#include <errno.h> +#include <stdio.h> +#include "mman.h" + +static DWORD __map_mmap_prot_page(const int prot) { + DWORD protect = 0; + + if (prot == PROT_NONE) + return protect; + + if ((prot & PROT_EXEC) != 0) { + protect = ((prot & PROT_WRITE) != 0) ? + PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ; + } else { + protect = ((prot & PROT_WRITE) != 0) ? + PAGE_READWRITE : PAGE_READONLY; + } + return protect; +} + +static DWORD __map_mmap_prot_file(const int prot) { + + DWORD desiredAccess = 0; + if (prot == PROT_NONE) + return desiredAccess; + + if ((prot & PROT_READ) != 0) + desiredAccess |= FILE_MAP_READ; + if ((prot & PROT_WRITE) != 0) + desiredAccess |= FILE_MAP_WRITE; + if ((prot & PROT_EXEC) != 0) + desiredAccess |= FILE_MAP_EXECUTE; + + return desiredAccess; +} + +void* mmap(void *addr, size_t len, int prot, int flags, int fildes, offset_t off) +{ + HANDLE fm, h; + void *map = MAP_FAILED; + + const DWORD dwFileOffsetLow = (sizeof(offset_t) <= sizeof(DWORD)) ? + (DWORD)off : (DWORD)(off & 0xFFFFFFFFL); + const DWORD dwFileOffsetHigh = (sizeof(offset_t) <= sizeof(DWORD)) ? + (DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFF00000000L); + const DWORD protect = __map_mmap_prot_page(prot); + const DWORD desiredAccess = __map_mmap_prot_file(prot); + + const offset_t maxSize = off + (offset_t)len; + + const DWORD dwMaxSizeLow = (sizeof(offset_t) <= sizeof(DWORD)) ? + (DWORD)maxSize : (DWORD)(maxSize & 0xFFFFFFFFL); + const DWORD dwMaxSizeHigh = (sizeof(offset_t) <= sizeof(DWORD)) ? + (DWORD)0 : (DWORD)((maxSize >> 32) & 0xFFFFFFFF00000000L); + _set_errno(0); + + if (len == 0 || prot == PROT_EXEC) { + _set_errno(EINVAL); + return MAP_FAILED; + } + + h = ((flags & MAP_ANONYMOUS) == 0) ? 
+ (HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE; + + if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE) { + _set_errno(EBADF); + return MAP_FAILED; + } + + fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL); + + if (fm == NULL) { + int a = errno; + _set_errno(GetLastError()); + a = errno; + printf("%d", a); + return MAP_FAILED; + } + + if ((flags & MAP_FIXED) == 0) { + map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len); + } + else { + map = MapViewOfFileEx(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len, addr); + } + CloseHandle(fm); + + if (map == NULL) { + _set_errno(GetLastError()); + return MAP_FAILED; + } + return map; +} + +int munmap(void *addr, size_t len) { + if (UnmapViewOfFile(addr)) + return 0; + + _set_errno(GetLastError()); + return -1; +} + +int _mprotect(void *addr, size_t len, int prot) { + DWORD newProtect = __map_mmap_prot_page(prot); + DWORD oldProtect = 0; + + if (VirtualProtect(addr, len, newProtect, &oldProtect)) + return 0; + _set_errno(GetLastError()); + return -1; +} + +int msync(void *addr, size_t len, int flags) { + if (FlushViewOfFile(addr, len)) + return 0; + _set_errno(GetLastError()); + return -1; +} + +int mlock(const void *addr, size_t len) { + if (VirtualLock((LPVOID)addr, len)) + return 0; + _set_errno(GetLastError()); + return -1; +} + +int munlock(const void *addr, size_t len) { + if (VirtualUnlock((LPVOID)addr, len)) + return 0; + _set_errno(GetLastError()); + return -1; +} + +ssize_t pread(int fd, void *buf, unsigned long count, offset_t offset) { + unsigned long len = 0; + + OVERLAPPED overlapped; + memset(&overlapped, 0, sizeof(OVERLAPPED)); + overlapped.OffsetHigh = (uint32_t)((offset & 0xFFFFFFFF00000000LL) >> 32); + overlapped.Offset = (uint32_t)(offset & 0xFFFFFFFFLL); + + HANDLE file = (HANDLE)_get_osfhandle(fd); + if ((!ReadFile(file, buf, count, &len, &overlapped)) && GetLastError() != ERROR_HANDLE_EOF) { + _set_errno(GetLastError()); 
+ return -1; + } + return len; +} + +ssize_t pwrite(int fd, const void *buf, unsigned long count, offset_t offset) { + long unsigned int len = 0; + + OVERLAPPED overlapped; + memset(&overlapped, 0, sizeof(OVERLAPPED)); + overlapped.OffsetHigh = (uint32_t)((offset & 0xFFFFFFFF00000000LL) >> 32); + overlapped.Offset = (uint32_t)(offset & 0xFFFFFFFFLL); + + HANDLE file = (HANDLE)_get_osfhandle(fd); + if (!WriteFile(file, buf, count, &len, &overlapped)) { + _set_errno(GetLastError()); + return -1; + } + return len; +} diff --git a/libdap-cuttdb/src/mman.h b/libdap-cuttdb/src/mman.h new file mode 100644 index 0000000000000000000000000000000000000000..b8bb8cb78d0fdf69037399dd5d2845b857e08624 --- /dev/null +++ b/libdap-cuttdb/src/mman.h @@ -0,0 +1,59 @@ +#ifndef _MMAN_H_ +#define _MMAN_H_ + +#ifndef _WIN32_WINNT +#define _WIN32_WINNT 0x0600 +#endif + +#include <_mingw.h> +#include <stdint.h> +#include <io.h> + +#if defined(_WIN64) +typedef int64_t offset_t; +#else +typedef uint32_t offset_t; +#endif + +#include <sys/types.h> +#include <stdbool.h> +#ifdef __cplusplus +extern "C" { +#endif + +#define PROT_NONE 0 +#define PROT_READ 1 +#define PROT_WRITE 2 +#define PROT_EXEC 4 + +#define MAP_FILE 0 +#define MAP_SHARED 1 +#define MAP_PRIVATE 2 +#define MAP_TYPE 0xf +#define MAP_FIXED 0x10 +#define MAP_ANONYMOUS 0x20 +#define MAP_ANON MAP_ANONYMOUS + +#define MAP_FAILED ((void *)-1) + +#define MS_ASYNC 1 +#define MS_SYNC 2 +#define MS_INVALIDATE 4 + +#define fdatasync(fd) _commit(fd) + +void* mmap(void *addr, size_t len, int prot, int flags, int fildes, offset_t offset); +int munmap(void *addr, size_t len); +int _mprotect(void *addr, size_t len, int prot); +int msync(void *addr, size_t len, int flags); +int mlock(const void *addr, size_t len); +int munlock(const void *addr, size_t len); + +ssize_t pread(int fd, void *buf, unsigned long count, offset_t offset); +ssize_t pwrite(int fd, const void *buf, unsigned long count, offset_t offset); + +#ifdef __cplusplus +} +#endif + 
+#endif /* _MMAN_H_ */ diff --git a/libdap-cuttdb/src/server-thread.c b/libdap-cuttdb/src/server-thread.c new file mode 100644 index 0000000000000000000000000000000000000000..c7a05c30319e63a177178ab43a84bc7b5435fb11 --- /dev/null +++ b/libdap-cuttdb/src/server-thread.c @@ -0,0 +1,217 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * The server&network part of CuttDB is based on Beansdb: + * + * http://beansdb.googlecode.com + * + * Beansdb is most based on Memcachedb and Memcached: + * + * http://memcachedb.org/ + * http://danga.com/memcached/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. + * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + +#include "cuttdb-server.h" +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> +#include <errno.h> +#include <assert.h> +#include <string.h> +#include <unistd.h> + +#ifdef HAVE_MALLOC_H +#include <malloc.h> +#endif + +#ifdef HAVE_STRING_H +#include <string.h> +#endif + +#include <pthread.h> + +typedef struct EventLoop { +// int maxfd; + conn* conns[AE_SETSIZE]; + int fired[AE_SETSIZE]; + int nready; + void *apidata; +} EventLoop; + +/* Lock for connection freelist */ +static pthread_mutex_t conn_lock; + +/* Lock for item buffer freelist */ +static pthread_mutex_t ibuffer_lock; + +static EventLoop loop; +static pthread_mutex_t leader; + +/* + * Pulls a conn structure from the freelist, if one is available. + */ +conn *mt_conn_from_freelist() { + conn *c; + pthread_mutex_lock(&conn_lock); + c = do_conn_from_freelist(); + pthread_mutex_unlock(&conn_lock); + return c; +} + +/* + * Adds a conn structure to the freelist. + * + * Returns 0 on success, 1 if the structure couldn't be added. 
+ */ +bool mt_conn_add_to_freelist(conn *c) { + bool result; + + pthread_mutex_lock(&conn_lock); + result = do_conn_add_to_freelist(c); + pthread_mutex_unlock(&conn_lock); + + return result; +} + + +/******************************* GLOBAL STATS ******************************/ + +void mt_stats_lock() { +} + +void mt_stats_unlock() { +} + +/* Include the best multiplexing layer supported by this system. + * The following should be ordered by performances, descending. */ +#ifdef HAVE_EPOLL +#include "ae_epoll.c" +#else + #ifdef HAVE_KQUEUE + #include "ae_kqueue.c" + #else + #include "ae_select.c" + #endif +#endif + +/* + * Initializes the thread subsystem, creating various worker threads. + * + * nthreads Number of event handler threads to spawn + */ +void thread_init(int nthreads) { + pthread_mutex_init(&ibuffer_lock, NULL); + pthread_mutex_init(&conn_lock, NULL); + pthread_mutex_init(&leader, NULL); + + memset(&loop, 0, sizeof(loop)); + if (aeApiCreate(&loop) == -1) { + exit(1); + } +} + +int add_event(int fd, int mask, conn *c) +{ + if (fd >= AE_SETSIZE) { + fprintf(stderr, "fd is too large: %d\n", fd); + return AE_ERR; + } + assert(loop.conns[fd] == NULL); + loop.conns[fd] = c; + if (aeApiAddEvent(&loop, fd, mask) == -1){ + loop.conns[fd] = NULL; + return AE_ERR; + } +// if (fd > loop.maxfd) +// loop.maxfd = fd; + return AE_OK; +} + +int update_event(int fd, int mask, conn *c) +{ + loop.conns[fd] = c; + if (aeApiUpdateEvent(&loop, fd, mask) == -1){ + loop.conns[fd] = NULL; + return AE_ERR; + } + return AE_OK; +} + +int delete_event(int fd) +{ + if (fd >= AE_SETSIZE) return -1; + if (loop.conns[fd] == NULL) return 0; + if (aeApiDelEvent(&loop, fd) == -1) + return -1; + loop.conns[fd] = NULL; + return 0; +} + +static void *worker_main(void *arg) { + pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, 0); + + struct timeval tv = {1, 0}; + while (!daemon_quit) { + pthread_mutex_lock(&leader); + +AGAIN: + while(loop.nready == 0 && daemon_quit == 0) + loop.nready = 
aeApiPoll(&loop, &tv); + if (daemon_quit) { + pthread_mutex_unlock(&leader); + break; + } + + loop.nready --; + int fd = loop.fired[loop.nready]; + conn *c = loop.conns[fd]; + if (c == NULL){ + fprintf(stderr, "Bug: conn %d should not be NULL\n", fd); + close(fd); + goto AGAIN; + } + loop.conns[fd] = NULL; + pthread_mutex_unlock(&leader); + + drive_machine(c); + if (c->ev_flags > 0) { + update_event(fd, c->ev_flags, c); + } + } + return NULL; +} + +void loop_run(int nthread) +{ + int i, ret; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_t* tids = malloc(sizeof(pthread_t) * nthread); + + for (i=0; i<nthread - 1; i++) { + if ((ret = pthread_create(tids + i, &attr, worker_main, NULL)) != 0) { + fprintf(stderr, "Can't create thread: %s\n", + strerror(ret)); + exit(1); + } + } + + worker_main(NULL); + + // wait workers to stop + for (i=0; i<nthread - 1; i++) { + (void) pthread_join(tids[i], NULL); + } + free(tids); +} + diff --git a/libdap-cuttdb/src/test_mt.c b/libdap-cuttdb/src/test_mt.c new file mode 100644 index 0000000000000000000000000000000000000000..de4d383731a4a66bc2e690f40dfc742ec022073e --- /dev/null +++ b/libdap-cuttdb/src/test_mt.c @@ -0,0 +1,149 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. 
+ * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <unistd.h> +#include <pthread.h> +#include "cuttdb.h" + + +CDB *db; + +enum { + SETOP, + GETOP, + DELOP, +}; + +#if 1 +static int prob_table1[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, DELOP, GETOP}; +static int prob_table2[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, DELOP, DELOP, GETOP}; +static int prob_table3[8] = {SETOP, SETOP, SETOP, DELOP, DELOP, DELOP, DELOP, GETOP}; +#else +static int prob_table1[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, GETOP}; +static int prob_table2[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, GETOP}; +static int prob_table3[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, GETOP}; +#endif +int *optable = NULL; + + +long get_rand() +{ + return (long)rand() * RAND_MAX + rand(); +} + + +void *test_thread(void *arg) +{ + char key[64]; + char value[128]; + void *v; + int knum = *(int*)arg; + while(1) { + int krand = get_rand() % knum; + int ksize = snprintf(key, 64, "%ld%ld%ld", krand, krand, krand); + int vsize = snprintf(value, 128, "%ld%ld%ld%ld%d%ld%ld%ld%ld", + krand, krand, krand, krand, krand, krand, krand, krand); + int op = optable[rand() & 0x07]; + int expire = 600 + 20 * (rand() % 1000); + switch(op) { + case SETOP: + if (cdb_set2(db, key, ksize, value, vsize, CDB_OVERWRITE | CDB_INSERTCACHE, expire) < 0) + printf("ERROR! %s:%d\n", __FILE__, __LINE__); + break; + case GETOP: + if (cdb_get(db, key, ksize, &v, &vsize) == -1) + printf("ERROR! %s:%d\n", __FILE__, __LINE__); + if (v) + cdb_free_val(&v); + break; + case DELOP: + if (cdb_del(db, key, ksize) == -1) + printf("ERROR! 
%s:%d\n", __FILE__, __LINE__); + break; + default: + break; + } + } +} + + + +int main(int argc, char *argv[]) +{ + int thread_num = 2; + int record_num = 10000000; + char *db_path = NULL; + printf("Usage: %s db_path [record_num] [thread_num]\n", argv[0]); + if (argc >= 2) + db_path = argv[1]; + else + return -1; + + if (argc >= 3) + record_num = atoi(argv[2]); + if (argc >= 4) + thread_num = atoi(argv[3]); + + record_num = record_num < 100? 100: record_num; + thread_num = thread_num < 1? 1: thread_num; + srand(time(NULL)); + + db = cdb_new(); + cdb_option(db, record_num / 100, 0, 1024000); + if (cdb_open(db, db_path, CDB_CREAT | CDB_TRUNC) < 0) { + printf("DB Open err\n"); + return -1; + } + + + optable = prob_table1; + pthread_t threads[thread_num]; + for(int i = 0; i < thread_num; i++) { + pthread_create(&threads[i], NULL, test_thread, &record_num); + } + + int clear_interval = 0; + while(1) { + CDBSTAT st; + cdb_stat(db, &st); + printf("rnum: %lu, rcnum: %lu, pnum: %lu, pcnum %lu, rlatcy: %u wlatcy: %u" + " rh/m: %lu/%lu ph/m: %lu/%lu\n", + st.rnum, st.rcnum, st.pnum, st.pcnum, st.rlatcy, st.wlatcy, + st.rchit, st.rcmiss, st.pchit, st.pcmiss); + if (++clear_interval % 20 == 0) + cdb_stat(db, NULL); + + if (st.rnum > 0.7 * record_num) + optable = prob_table2; + if (st.rnum > 0.9 * record_num) + optable = prob_table3; + + if (st.rnum < 0.8 * record_num) + optable = prob_table2; + + if (st.rnum < 0.6 * record_num) + optable = prob_table1; + fflush(stdout); + sleep(1); + } + + return 0; +} + + + diff --git a/libdap-cuttdb/src/vio_apnd2.c b/libdap-cuttdb/src/vio_apnd2.c new file mode 100644 index 0000000000000000000000000000000000000000..3f093a6fb55150cc0c7ac927f042a9cf0fc404aa --- /dev/null +++ b/libdap-cuttdb/src/vio_apnd2.c @@ -0,0 +1,2647 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. 
+ * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#include "vio_apnd2.h" +#include "cdb_hashtable.h" +#include "cdb_bgtask.h" +#include "cdb_lock.h" +#include "cuttdb.h" +#include "cdb_core.h" +#include "cdb_errno.h" +#include "cdb_types.h" +#include "cdb_crc64.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <time.h> +#include <sys/stat.h> +#ifdef _WIN32 +#include "mman.h" +#else +#include <sys/mman.h> +#endif +#include <fcntl.h> +#include <unistd.h> +#include <dirent.h> + +/* record magic bytes */ +#define RECMAGIC 0x19871022 +/* obsoleted, but appeared in some code */ +#define DELRECMAGIC 0x19871023 +#define PAGEMAGIC 0x19890604 + +/* data buffered before pwrite to disk */ +#define IOBUFSIZE (2 * MB) +/* structure of deletion buffer differs from the others, buffered DELBUFMAX records at most */ +#define DELBUFMAX 10000 + +/* index(page) file size limit */ +#define FIDXMAXSIZE (16 * MB) +/* data file size limit */ +#define FDATMAXSIZE (128 * MB) +/* all meta information are regulated to fix size */ +#define FILEMETASIZE 64 +/* the file opened simultaneously limit, managed by LRU */ +#define MAXFD 16384 +#define MAX_PATH_LEN 255 + +#define FILEMAGICHEADER "CuTtDbFiLePaRtIaL" +#define FILEMAGICLEN (strlen(FILEMAGICHEADER)) +/* page or data records are stored at aligned offset */ +#define ALIGNBYTES 16 + +/* virtual offset(48bits) transform into real offset(fid,offset) */ +#define VOFF2ROFF(off, fid, roff) do{fid = (off).i4 >> 8; \ + roff = ((off).i4 & 0xff) << 16; roff = (roff | (off).i2) * ALIGNBYTES;}while(0) + +/* real offset transform into virtual offset */ +#define ROFF2VOFF(fid, roff, off) do{(off).i4 = fid << 8; \ + (off).i4 |= (roff / ALIGNBYTES) >> 16; (off).i2 = (roff / ALIGNBYTES) & 0xffff;} while(0) + +/* align to a integer offset */ +#define OFFALIGNED(off) ((((off)-1) | (ALIGNBYTES - 1)) + 1) + +/* used in fd LRU-cached, distinguish index or data files' 
fd */ +#define VFIDIDX(fid) (fid * 2) +#define VFIDDAT(fid) (fid * 2 + 1) + +/* how often write out buffered data */ +#define FLUSHTIMEOUT 5 +/* how often to check if index file needs space recycle */ +#define RCYLEPAGEINTERVAL 60 +/* how often to check if data file needs space recycle */ +#define RCYLEDATAINTERVAL 120 +/* data file space recycle check interval factor (seconds per data file/128MB)*/ +#define DATARCYLECHECKFACTOR 1800 + + +/* three type of file */ +enum { + /* random value */ + VIOAPND2_INDEX = 0x97, + VIOAPND2_DATA = 0x98, + VIOAPND2_DELLOG = 0x99, +}; + + +/* where the record comes from when calling writerec */ +enum { + VIOAPND2_RECEXTERNAL = 0, + VIOAPND2_RECINTERNAL = 1, +}; + + +/* a file is writing or full? */ +enum { + VIOAPND2_WRITING = 0, + VIOAPND2_FULL = 1, +}; + +/* signature in the header file, indicates it's open or be safety closed */ +enum { + /* any number doens't matter */ + VIOAPND2_SIGOPEN = 2, + VIOAPND2_SIGCLOSED = 3, +}; + + +/* buffer for IO */ +typedef struct { + uint32_t limit; + uint32_t off; + uint32_t pos; + uint32_t fid; + uint64_t oid; + int fd; + char buf[IOBUFSIZE]; +} VIOAPND2IOBUF; + + +/* file information for every file */ +typedef struct VIOAPND2FINFO { + /* fid */ + uint32_t fid; + /* first oid */ + uint64_t oidf; + /* last oid */ + uint64_t oidl; + + /* next file */ + struct VIOAPND2FINFO *fnext; + /* prev file */ + struct VIOAPND2FINFO *fprev; + + uint32_t fsize; + /* junk space */ + uint32_t rcyled; + /* nearest expire time */ + uint32_t nexpire; + /* last time for recycle check */ + uint32_t lcktime; + /* index page file or data file? */ + uint8_t ftype; + /* writing or full? */ + uint8_t fstatus; + /* ref count, avoid unlink failure */ + uint32_t ref; + /* whether unlink the file after dereference */ + bool unlink; +} VIOAPND2FINFO; + + +typedef struct { + /* a new db? 
*/ + bool create; + /* fd number limit */ + int maxfds; + /* opened files' fds cache */ + CDBHASHTABLE *fdcache; + + /* number of data file */ + uint32_t dfnum; + /* number of index file */ + uint32_t ifnum; + + /* Buffers */ + VIOAPND2IOBUF dbuf; + VIOAPND2IOBUF ibuf; + FOFF delbuf[DELBUFMAX]; + int delbufpos; + + /* db path */ + char *filepath; + + + /* file information of index files */ + CDBHASHTABLE *idxmeta; + VIOAPND2FINFO *idxfhead; + VIOAPND2FINFO *idxftail; + /* file information of data files */ + CDBHASHTABLE *datmeta; + VIOAPND2FINFO *datfhead; + VIOAPND2FINFO *datftail; + + /* fd for db header */ + int hfd; + /* fd for files meta header */ + int mfd; + /* fd for deletion log */ + int dfd; + + /* lock for all I/O operation */ + CDBLOCK *lock; + + int idxitfid; + uint32_t idxitoff; + char *idxmmap; + +} VIOAPND2; + + +/* iterator for index/data */ +typedef struct { + /* current open fd */ + int fd; + /* current offset in file*/ + uint32_t off; + /* current operation id */ + uint64_t oid; + /* current file size*/ + uint64_t fsize; + /* mapped of file */ + char *mmap; + /* reference of filemeta struct */ + VIOAPND2FINFO *finfo; +} VIOAPND2ITOR; + + +static int _vio_apnd2_open(CDBVIO *vio, const char *filepath, int flags); +static int _vio_apnd2_checkpid(CDBVIO *vio); +static int _vio_apnd2_write(CDBVIO *vio, int fd, void *buf, uint32_t size, bool aligned); +static int _vio_apnd2_read(CDBVIO *vio, int fd, void *buf, uint32_t size, uint64_t off); +static int _vio_apnd2_readmeta(CDBVIO *vio, bool overwrite); +static int _vio_apnd2_writemeta(CDBVIO *vio); +static int _vio_apnd2_close(CDBVIO *vio); +static int _vio_apnd2_writerec(CDBVIO *vio, CDBREC *rec, FOFF *off, int ptrtype); +static int _vio_apnd2_writerecexternal(CDBVIO *vio, CDBREC *rec, FOFF *off); +static int _vio_apnd2_writerecinternal(CDBVIO *vio, CDBREC *rec, FOFF *off); +static int _vio_apnd2_deleterec(CDBVIO *vio, CDBREC *rec, FOFF off); +static int _vio_apnd2_readrec(CDBVIO *vio, CDBREC** rec, 
FOFF off, bool readval); +static int _vio_apnd2_writepage(CDBVIO *vio, CDBPAGE *page, FOFF *off); +static int _vio_apnd2_readpage(CDBVIO *vio, CDBPAGE **page, FOFF off); +static int _vio_apnd2_sync(CDBVIO *vio); +static int _vio_apnd2_writehead2(CDBVIO *vio); +static int _vio_apnd2_writehead(CDBVIO *vio, bool wtable); +static int _vio_apnd2_readhead2(CDBVIO *vio); +static int _vio_apnd2_readhead(CDBVIO *vio, bool rtable); +static int _vio_apnd2_writefmeta(CDBVIO *vio, int fd, VIOAPND2FINFO *finfo); +static int _vio_apnd2_readfmeta(CDBVIO *vio, int fd, VIOAPND2FINFO *finfo); +static int _vio_apnd2_flushbuf(CDBVIO *vio, int dtype); +static void _vio_apnd2_flushtask(void *arg); +static void _vio_apnd2_rcyledataspacetask(void *arg); +static void _vio_apnd2_fixcachepageooff(CDB *db, uint32_t bit, FOFF off); +static void _vio_apnd2_rcylepagespacetask(void *arg); +static int _vio_apnd2_shiftnew(CDBVIO *vio, int dtype); +static int _vio_apnd2_recovery(CDBVIO *vio, bool force); +static void _vio_apnd2_unlink(CDBVIO *vio, VIOAPND2FINFO *finfo, int dtype); +static VIOAPND2FINFO* _vio_apnd2_fileiternext(CDBVIO *vio, int dtype, uint64_t oid); +static int _vio_apnd2_iterfirst(CDBVIO *vio, VIOAPND2ITOR *it, int dtype, int64_t oid); +static int _vio_apnd2_iterfree(CDBVIO *vio, int dtype, VIOAPND2ITOR *it); +static int _vio_apnd2_pageiternext(CDBVIO *vio, CDBPAGE **page, void *iter); +static int _vio_apnd2_reciternext(CDBVIO *vio, CDBREC **rec, void *iter); +static void* _vio_apnd2_reciterfirst(CDBVIO *vio, uint64_t oid); +static void* _vio_apnd2_pageiterfirst(CDBVIO *vio, uint64_t oid); +static void _vio_apnd2_reciterdestory(CDBVIO *vio, void *iter); +static void _vio_apnd2_pageiterdestory(CDBVIO *vio, void *iter); +static void _vio_apnd2_cleanpoint(CDBVIO *vio); +static int _vio_apnd2_cmpfuncsreorder(const void *p1, const void *p2); +static int _vio_apnd2_checkopensig(CDBVIO *vio); +static int _vio_apnd2_setopensig(CDBVIO *vio, int sig); +static int 
_vio_apnd2_rcyledatafile(CDBVIO *vio, VIOAPND2FINFO *finfo, bool rcyle);
+
+
+/* hook the io methods */
+void vio_apnd2_init(CDBVIO *vio)
+{
+    vio->close = _vio_apnd2_close;
+    vio->open = _vio_apnd2_open;
+    vio->rpage = _vio_apnd2_readpage;
+    vio->wpage = _vio_apnd2_writepage;
+    vio->rrec = _vio_apnd2_readrec;
+    vio->drec = _vio_apnd2_deleterec;
+    vio->wrec = _vio_apnd2_writerecexternal;
+    vio->sync = _vio_apnd2_sync;
+    vio->rhead = _vio_apnd2_readhead2;
+    vio->whead = _vio_apnd2_writehead2;
+    vio->cleanpoint = _vio_apnd2_cleanpoint;
+    vio->pageitfirst = _vio_apnd2_pageiterfirst;
+    vio->pageitnext = _vio_apnd2_pageiternext;
+    vio->pageitdestroy = _vio_apnd2_pageiterdestory;
+    vio->recitfirst = _vio_apnd2_reciterfirst;
+    vio->recitnext = _vio_apnd2_reciternext;
+    vio->recitdestroy = _vio_apnd2_reciterdestory;
+}
+
+/* the hash table used in VIOAPND2 need not rehash, just use the key id is OK */
+static uint32_t _directhash(const void *key, int size)
+{
+    (void)size;
+    return *(const uint32_t *)key;
+}
+
+
+/* allocate a new VIOAPND2 object, called when open db.
+ * On allocation failure vio->iometa is set to NULL; the caller must check it. */
+static void _vio_apnd2_new(CDBVIO *vio)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)malloc(sizeof(VIOAPND2));
+
+    /* the object embeds two IOBUFSIZE buffers, so this is a multi-megabyte request */
+    if (myio == NULL) {
+        vio->iometa = NULL;
+        return;
+    }
+
+    myio->dfnum = myio->ifnum = 0;
+
+    myio->dbuf.fid = 0;
+    myio->dbuf.pos = 0;
+    myio->dbuf.off = 0;
+    myio->dbuf.oid = 0;
+    myio->dbuf.fd = -1;
+    memset(myio->dbuf.buf, 0, IOBUFSIZE);
+    myio->idxfhead = NULL;
+    myio->idxftail = NULL;
+
+    myio->ibuf.fid = 0;
+    myio->ibuf.pos = 0;
+    myio->ibuf.off = 0;
+    myio->ibuf.oid = 0;
+    myio->ibuf.fd = -1;
+    memset(myio->ibuf.buf, 0, IOBUFSIZE);
+    myio->datfhead = NULL;
+    myio->datftail = NULL;
+
+    myio->delbufpos = 0;
+
+    myio->ifnum = 0;
+    myio->dfnum = 0;
+
+    myio->mfd = -1;
+    myio->hfd = -1;
+    myio->dfd = -1;
+
+    myio->fdcache = cdb_ht_new(true, _directhash);
+    /* the following two are look-up table, need not LRU */
+    myio->idxmeta = cdb_ht_new(false, _directhash);
+    myio->datmeta = cdb_ht_new(false, _directhash);
+
+    myio->lock = cdb_lock_new(CDB_LOCKMUTEX);
+
+    myio->create = true;
+    myio->maxfds = MAXFD;
+    myio->filepath = NULL;
+
+    vio->iometa = myio;
+}
+
+
+/* free a VIOAPND2 object, called when close db */
+static void _vio_apnd2_destroy(CDBVIO *vio)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    cdb_ht_destroy(myio->fdcache);
+    cdb_ht_destroy(myio->idxmeta);
+    cdb_ht_destroy(myio->datmeta);
+    cdb_lock_destory(myio->lock);
+    if (myio->filepath)
+        free(myio->filepath);
+    free(myio);
+    vio->iometa = NULL;
+}
+
+/* check if another process has already open the current db.
+ * Returns 0 after (re)writing <path>/pid.cdb with our pid, -1 if the file is
+ * unreadable or names a pid that still has a /proc entry. */
+static int _vio_apnd2_checkpid(CDBVIO *vio)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    struct stat st;
+    char filename[MAX_PATH_LEN] = {0};
+    char syspidpath[MAX_PATH_LEN] = {0};
+    snprintf(filename, MAX_PATH_LEN, "%s/pid.cdb", myio->filepath);
+
+    if (stat(filename, &st) == 0) {
+        /* pid file exist */
+        FILE *f = fopen(filename, "rt");
+        int pid = -1;
+        if (f == NULL) {
+            cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+            return -1;
+        }
+
+        int ret = fscanf(f, "%d", &pid);
+        fclose(f);
+        if (ret != 1) {
+            cdb_seterrno(vio->db, CDB_PIDEXIST, __FILE__, __LINE__);
+            return -1;
+        }
+
+        /* check if the process still alive */
+        snprintf(syspidpath, MAX_PATH_LEN, "/proc/%d", pid);
+        if (stat(syspidpath, &st) == 0) {
+            cdb_seterrno(vio->db, CDB_PIDEXIST, __FILE__, __LINE__);
+            return -1;
+        }
+    }
+
+    /* pid file non-exist or obsoleted */
+    FILE *f = fopen(filename, "wt");
+    if (f == NULL) {
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        return -1;
+    }
+    fprintf(f, "%d\n", getpid());
+    fclose(f);
+    return 0;
+}
+
+/* open an db by path and mode.
+ * Returns 0 on success; on failure returns -1 with the db errno set, every
+ * descriptor opened here closed and the VIOAPND2 object released. */
+static int _vio_apnd2_open(CDBVIO *vio, const char *filepath, int flags)
+{
+    int rflags = O_RDWR;
+    char filename[MAX_PATH_LEN] = {0};
+    int fsize;
+    int sigstatus;
+    VIOAPND2 *myio;
+
+    _vio_apnd2_new(vio);
+    myio = (VIOAPND2 *)vio->iometa;
+    if (myio == NULL) {
+        /* allocation failed: nothing to clean up yet */
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        return -1;
+    }
+    myio->filepath = strdup(filepath);
+    if (myio->filepath == NULL) {
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    if (flags & CDB_CREAT)
+        rflags |= O_CREAT;
+    if (flags & CDB_TRUNC)
+        rflags |= O_TRUNC;
+
+    if (_vio_apnd2_checkpid(vio) < 0) {
+        goto ERRRET;
+    }
+
+    snprintf(filename, MAX_PATH_LEN, "%s/mainindex.cdb", myio->filepath);
+    myio->hfd = open(filename, rflags, 0644);
+    if (myio->hfd < 0 && errno == ENOENT && (rflags & O_CREAT)) {
+        /* try to create, but path not exists */
+        cdb_seterrno(vio->db, CDB_DIRNOEXIST, __FILE__, __LINE__);
+        goto ERRRET;
+    } else if (myio->hfd < 0) {
+        /* other open error */
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    fsize = lseek(myio->hfd, 0, SEEK_END);
+    if (fsize) {
+        myio->create = false;
+        sigstatus = _vio_apnd2_checkopensig(vio);
+        if (sigstatus < 0) {
+            /* main table read error */
+            cdb_seterrno(vio->db, CDB_READERR, __FILE__, __LINE__);
+            goto ERRRET;
+        }
+    } else {
+        sigstatus = VIOAPND2_SIGCLOSED;
+    }
+
+    /* */
+    struct stat st;
+    snprintf(filename, MAX_PATH_LEN, "%s/force_recovery", myio->filepath);
+    if (stat(filename, &st) == 0) {
+        /* special file exist, force recovery to fix the database */
+        _vio_apnd2_recovery(vio, true);
+        unlink(filename);
+    } else if (sigstatus == VIOAPND2_SIGOPEN) {
+        /* didn't properly closed last time */
+        _vio_apnd2_recovery(vio, false);
+    } else if (sigstatus != VIOAPND2_SIGCLOSED) {
+        cdb_seterrno(vio->db, CDB_DATAERRMETA, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    if (_vio_apnd2_setopensig(vio, VIOAPND2_SIGOPEN) < 0) {
+        cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    snprintf(filename, MAX_PATH_LEN, "%s/mainmeta.cdb", myio->filepath);
+    myio->mfd = open(filename, rflags, 0644);
+    if (myio->mfd < 0) {
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    fsize = lseek(myio->mfd, 0, SEEK_END);
+    if (fsize) {
+        /* exist database */
+        _vio_apnd2_readmeta(vio, false);
+
+        /* open current data file and index file for buffer */
+        snprintf(filename, MAX_PATH_LEN, "%s/idx%08d.cdb", myio->filepath, myio->ibuf.fid);
+        myio->ibuf.fd = open(filename, rflags, 0644);
+        if (myio->ibuf.fd < 0) {
+            /* without the current index file no page can be written */
+            cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+            goto ERRRET;
+        }
+        myio->ibuf.limit = CDBMIN(IOBUFSIZE, FIDXMAXSIZE - myio->ibuf.off);
+        myio->ibuf.pos = 0;
+
+        snprintf(filename, MAX_PATH_LEN, "%s/dat%08d.cdb", myio->filepath, myio->dbuf.fid);
+        myio->dbuf.fd = open(filename, rflags, 0644);
+        if (myio->dbuf.fd < 0) {
+            /* release the index file opened just above before bailing out */
+            close(myio->ibuf.fd);
+            myio->ibuf.fd = -1;
+            cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+            goto ERRRET;
+        }
+        myio->dbuf.limit = CDBMIN(IOBUFSIZE, FDATMAXSIZE - myio->dbuf.off);
+        myio->dbuf.pos = 0;
+    } else {
+        /* new database */
+        myio->create = true;
+        /* remember the bnum */
+        _vio_apnd2_writehead(vio, false);
+        _vio_apnd2_shiftnew(vio, VIOAPND2_INDEX);
+        _vio_apnd2_shiftnew(vio, VIOAPND2_DATA);
+    }
+
+    snprintf(filename, MAX_PATH_LEN, "%s/dellog.cdb", myio->filepath);
+    myio->dfd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644);
+    if (myio->dfd < 0) {
+        cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__);
+        goto ERRRET;
+    }
+
+    /* set background tasks, flush buffer and recycle space */
+    cdb_bgtask_add(vio->db->bgtask, _vio_apnd2_flushtask, vio, FLUSHTIMEOUT);
+    cdb_bgtask_add(vio->db->bgtask, _vio_apnd2_rcylepagespacetask, vio, RCYLEPAGEINTERVAL);
+    cdb_bgtask_add(vio->db->bgtask, _vio_apnd2_rcyledataspacetask, vio, RCYLEDATAINTERVAL);
+    return 0;
+
+ERRRET:
+    /* descriptors start at -1 in _vio_apnd2_new(); 0 is a valid descriptor */
+    if (myio->mfd >= 0)
+        close(myio->mfd);
+    if (myio->hfd >= 0)
+        close(myio->hfd);
+    if (myio->dfd >= 0)
+        close(myio->dfd);
+    _vio_apnd2_destroy(vio);
+    return -1;
+}
+
+
+/* task for flush buffer: writes out the data, index and deletion-log buffers
+ * while holding the I/O lock */
+static void _vio_apnd2_flushtask(void *arg)
+{
+    CDBVIO *vio = (CDBVIO *)arg;
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    cdb_lock_lock(myio->lock);
+    _vio_apnd2_flushbuf(vio, VIOAPND2_DATA);
+    _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX);
+    _vio_apnd2_flushbuf(vio, VIOAPND2_DELLOG);
+    cdb_lock_unlock(myio->lock);
+}
+
+
+/* read information for db files, 'overwrite' indicates recovery */
+static int _vio_apnd2_readmeta(CDBVIO *vio, bool overwrite)
+{
+    VIOAPND2 *myio = (VIOAPND2 *)vio->iometa;
+    char buf[FILEMETASIZE];
+    char *hbuf;
+    int hbufsize;
+    int pos = 0;
+
+    if (pread(myio->mfd, buf, FILEMETASIZE, 0) != FILEMETASIZE) {
+        if (overwrite)
+            return 0;
+        cdb_seterrno(vio->db, CDB_READERR, __FILE__, __LINE__);
+        return -1;
+    }
+
+    if (memcmp(buf, FILEMAGICHEADER, FILEMAGICLEN)
!= 0) { + cdb_seterrno(vio->db, CDB_DATAERRMETA, __FILE__, __LINE__); + return -1; + } + + pos += FILEMAGICLEN; + cdb_lock_lock(myio->lock); + if (!overwrite) + myio->ibuf.off = *(uint32_t*)(buf + pos); + pos += SI4; + myio->ibuf.limit = *(uint32_t*)(buf + pos); + pos += SI4; + if (!overwrite) + myio->dbuf.off = *(uint32_t*)(buf + pos); + pos += SI4; + myio->dbuf.limit = *(uint32_t*)(buf + pos); + pos += SI4; + if (!overwrite) + myio->ifnum = *(uint32_t*)(buf + pos); + pos += SI4; + if (!overwrite) + myio->dfnum = *(uint32_t*)(buf + pos); + pos += SI4; + if (!overwrite) + myio->ibuf.fid = *(uint32_t*)(buf + pos); + pos += SI4; + if (!overwrite) + myio->dbuf.fid = *(uint32_t*)(buf + pos); + pos += SI4; + + hbufsize = (SI4 + SI4 + SI4 + SI8 + SI8 + 1 + 1) * myio->ifnum; + hbufsize += (SI4 + SI4 + SI4 + SI4 + SI8 + SI8 + 1 + 1) * myio->dfnum; + hbuf = (char*)malloc(hbufsize); + pos = 0; + + if (pread(myio->mfd, hbuf, hbufsize, FILEMETASIZE) != hbufsize) { + cdb_lock_unlock(myio->lock); + free(hbuf); + if (overwrite) + return 0; + cdb_seterrno(vio->db, CDB_READERR, __FILE__, __LINE__); + return -1; + } + + for(int i = 0; i < myio->ifnum; i++) { + VIOAPND2FINFO finfo, *finfo2; + finfo.fid = *(uint32_t*)(hbuf + pos); + pos += SI4; + finfo.fsize = *(uint32_t*)(hbuf + pos); + pos += SI4; + finfo.rcyled = *(uint32_t*)(hbuf + pos); + pos += SI4;; + finfo.oidf = *(uint64_t*)(hbuf + pos); + pos += SI8; + finfo.oidl = *(uint64_t*)(hbuf + pos); + pos += SI8; + finfo.fstatus = *(uint8_t*)(hbuf + pos); + pos += 1; + finfo.ftype = *(uint8_t*)(hbuf + pos); + pos += 1; + finfo.ref = 0; + finfo.unlink = false; + if (overwrite) { + /* in recovery mode only fix 'recycled size' */ + /* But do nothing with index files */ + continue; + } + finfo2 = (VIOAPND2FINFO *)cdb_ht_insert2(myio->idxmeta, &finfo.fid, SI4, &finfo, sizeof(finfo)); + if (myio->idxfhead) { + finfo2->fprev = myio->idxftail; + myio->idxftail->fnext = finfo2; + finfo2->fnext = NULL; + myio->idxftail = finfo2; + } else { + 
myio->idxfhead = myio->idxftail = finfo2; + finfo2->fprev = finfo2->fnext = NULL; + } + } + + for(int i = 0; i < myio->dfnum; i++) { + VIOAPND2FINFO finfo, *finfo2; + finfo.fid = *(uint32_t*)(hbuf + pos); + pos += SI4; + finfo.fsize = *(uint32_t*)(hbuf + pos); + pos += SI4; + finfo.rcyled = *(uint32_t*)(hbuf + pos); + pos += SI4; + finfo.nexpire = *(uint32_t*)(hbuf + pos); + pos += SI4; + finfo.oidf = *(uint64_t*)(hbuf + pos); + pos += SI8; + finfo.oidl = *(uint64_t*)(hbuf + pos); + pos += SI8; + finfo.fstatus = *(uint8_t*)(hbuf + pos); + pos += 1; + finfo.ftype = *(uint8_t*)(hbuf + pos); + pos += 1; + finfo.ref = 0; + finfo.unlink = false; + finfo.lcktime = time(NULL); + if (overwrite) { + /* in recovery mode only fix 'recycled size' */ + finfo2 = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &finfo.fid, SI4, false); + if (finfo2) { + finfo2->rcyled = finfo.rcyled; + finfo2->nexpire = finfo.nexpire; + } + continue; + } + finfo2 = (VIOAPND2FINFO *)cdb_ht_insert2(myio->datmeta, &finfo.fid, SI4, &finfo, sizeof(finfo)); + if (myio->datfhead) { + finfo2->fprev = myio->datftail; + myio->datftail->fnext = finfo2; + finfo2->fnext = NULL; + myio->datftail = finfo2; + } else { + myio->datfhead = myio->datftail = finfo2; + finfo2->fprev = finfo2->fnext = NULL; + } + } + cdb_lock_unlock(myio->lock); + free(hbuf); + + return 0; +} + + +/* flush i/o buffer */ +static int _vio_apnd2_flushbuf(CDBVIO *vio, int dtype) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + VIOAPND2FINFO *finfo; + VIOAPND2IOBUF *iobuf; + CDBHASHTABLE *ht; + uint32_t *fid; + uint32_t fsizemax; + + /* link to the proper operation object */ + if (dtype == VIOAPND2_INDEX) { + iobuf = &myio->ibuf; + ht = myio->idxmeta; + fsizemax = FIDXMAXSIZE; + } else if (dtype == VIOAPND2_DATA) { + iobuf = &myio->dbuf; + ht = myio->datmeta; + fsizemax = FDATMAXSIZE; + } else if (dtype == VIOAPND2_DELLOG) { + /* buffer for deletion is special */ + if (myio->delbufpos == 0) + return 0; + if (write(myio->dfd, myio->delbuf, 
sizeof(FOFF) * myio->delbufpos) + != sizeof(FOFF) * myio->delbufpos) { + cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__); + return -1; + } + myio->delbufpos = 0; + return 0; + } else { + cdb_seterrno(vio->db, CDB_INTERNALERR, __FILE__, __LINE__); + return -1; + } + fid = &iobuf->fid; + + /* get information from table */ + finfo = (VIOAPND2FINFO *)cdb_ht_get2(ht, fid, SI4, false); + if (finfo == NULL) { + cdb_seterrno(vio->db, CDB_INTERNALERR, __FILE__, __LINE__); + return -1; + } + + /* write out if buffered */ + if (iobuf->pos > 0) { + if (pwrite(iobuf->fd, iobuf->buf, iobuf->pos, iobuf->off) != iobuf->pos) { + /* to avoid compile warning */ + if (ftruncate(iobuf->fd, iobuf->off) < 0) ; + cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__); + return -1; + } + } + + /* mark the operation id */ + finfo->oidl = iobuf->oid; + + /* reset the buffer information */ + iobuf->pos = 0; + iobuf->off = lseek(iobuf->fd, 0, SEEK_END); + /* fix file size info whenever possible */ + finfo->fsize = iobuf->off; + iobuf->off = OFFALIGNED(iobuf->off); + + /* current writing file nearly full? 
open a new one */ + if (iobuf->off > fsizemax - 16 * KB) { + finfo->fstatus = VIOAPND2_FULL; + _vio_apnd2_writefmeta(vio, iobuf->fd, finfo); + close(iobuf->fd); + _vio_apnd2_shiftnew(vio, dtype); + } else + iobuf->limit = CDBMIN(IOBUFSIZE, fsizemax - iobuf->off) ; + + return 0; +} + +/* create a new file for buffer and writing */ +static int _vio_apnd2_shiftnew(CDBVIO *vio, int dtype) +{ + VIOAPND2 *myio = (VIOAPND2*)vio->iometa; + VIOAPND2IOBUF *iobuf; + CDBHASHTABLE *ht; + uint32_t *fnum; + uint32_t tryiter, curfid; + char filename[MAX_PATH_LEN]; + char ipfx[] = "idx"; + char dpfx[] = "dat"; + char *pfx; + + /* link to proper object by dtype */ + if (dtype == VIOAPND2_INDEX) { + iobuf = &myio->ibuf; + ht = myio->idxmeta; + fnum = &myio->ifnum; + pfx = ipfx; + } else if (dtype == VIOAPND2_DATA) { + iobuf = &myio->dbuf; + ht = myio->datmeta; + fnum = &myio->dfnum; + pfx = dpfx; + } else { + cdb_seterrno(vio->db, CDB_INTERNALERR, __FILE__, __LINE__); + return -1; + } + + curfid = iobuf->fid; + + /* reset invalid buffer, prevent for misuse */ + iobuf->fd = -1; + iobuf->fid = 0xffffff; + iobuf->limit = iobuf->pos = iobuf->off = 0xffffffff; + + /* find a valid fid, try 16M times at most */ + tryiter = 0; + while(cdb_ht_exist(ht, &curfid, SI4)) { + curfid++; + tryiter++; + if (tryiter == 0xffffff) { + cdb_seterrno(vio->db, CDB_NOFID, __FILE__, __LINE__); + return -1; + } + if (curfid == 0xffffff) + curfid = 0; + } + + /* open new file */ + snprintf(filename, MAX_PATH_LEN, "%s/%s%08d.cdb", myio->filepath, pfx, curfid); + iobuf->fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (iobuf->fd < 0) { + cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__); + return -1; + } + iobuf->limit = IOBUFSIZE; + iobuf->fid = curfid; + iobuf->off = FILEMETASIZE; + iobuf->pos = 0; + + /* set meta information for new file */ + VIOAPND2FINFO finfo, *finfo2; + finfo.fsize = lseek(iobuf->fd, 0, SEEK_END); + finfo.oidf = iobuf->oid; + finfo.oidl = iobuf->oid; + finfo.rcyled = 0; + 
finfo.lcktime = time(NULL); + finfo.fstatus = VIOAPND2_WRITING; + finfo.ftype = dtype; + finfo.fid = curfid; + finfo.unlink = false; + finfo.nexpire = 0xffffffff; + finfo.ref = 0; + /* meta information also be written to disk immediately */ + if (_vio_apnd2_writefmeta(vio, iobuf->fd, &finfo) < 0) { + close(iobuf->fd); + iobuf->fd = -1; + iobuf->fid = 0xffffff; + iobuf->limit = iobuf->pos = iobuf->off = 0xffffffff; + return -1; + } + (*fnum)++; + finfo2 = cdb_ht_insert2(ht, &curfid, SI4, &finfo, sizeof(VIOAPND2FINFO)); + if (dtype == VIOAPND2_INDEX) { + if (myio->idxfhead) { + finfo2->fprev = myio->idxftail; + myio->idxftail->fnext = finfo2; + finfo2->fnext = NULL; + myio->idxftail = finfo2; + } else { + myio->idxfhead = myio->idxftail = finfo2; + finfo2->fprev = finfo2->fnext = NULL; + } + } else if (dtype == VIOAPND2_DATA) { + if (myio->datfhead) { + finfo2->fprev = myio->datftail; + myio->datftail->fnext = finfo2; + finfo2->fnext = NULL; + myio->datftail = finfo2; + } else { + myio->datfhead = myio->datftail = finfo2; + finfo2->fprev = finfo2->fnext = NULL; + } + } + + return 0; +} + + +/* write a single file's meta information */ +static int _vio_apnd2_writefmeta(CDBVIO *vio, int fd, VIOAPND2FINFO *finfo) +{ + char buf[FILEMETASIZE]; + int pos = 0; + + memset(buf, 'X', FILEMETASIZE); + memcpy(buf, FILEMAGICHEADER, FILEMAGICLEN); + pos += FILEMAGICLEN; + *(uint64_t*)(buf + pos) = finfo->oidf; + pos += SI8; + *(uint64_t*)(buf + pos) = finfo->oidl; + pos += SI8; + *(uint32_t*)(buf + pos) = finfo->fsize; + pos += SI4; + *(uint32_t*)(buf + pos) = finfo->fid; + pos += SI4; + *(uint8_t*)(buf + pos) = finfo->fstatus; + pos++; + *(uint8_t*)(buf + pos) = finfo->ftype; + pos++; + + if (pwrite(fd, buf, FILEMETASIZE, 0) != FILEMETASIZE) { + cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__); + return -1; + } + return 0; +} + +/* read a single file's meta information */ +static int _vio_apnd2_readfmeta(CDBVIO *vio, int fd, VIOAPND2FINFO *finfo) +{ + char 
buf[FILEMETASIZE]; + int pos = 0; + + memset(buf, 'X', FILEMETASIZE); + if (pread(fd, buf, FILEMETASIZE, 0) != FILEMETASIZE) { + cdb_seterrno(vio->db, CDB_READERR, __FILE__, __LINE__); + return -1; + } + + if (memcmp(buf, FILEMAGICHEADER, FILEMAGICLEN)) { + cdb_seterrno(vio->db, CDB_DATAERRMETA, __FILE__, __LINE__); + return -1; + } + + pos += FILEMAGICLEN; + finfo->oidf = *(uint64_t*)(buf + pos); + pos += SI8; + finfo->oidl = *(uint64_t*)(buf + pos); + pos += SI8; + finfo->fsize = *(uint32_t*)(buf + pos); + pos += SI4; + finfo->fid = *(uint32_t*)(buf + pos); + pos += SI4; + finfo->fstatus = *(uint8_t*)(buf + pos); + pos++; + finfo->ftype = *(uint8_t*)(buf + pos); + pos++; + return 0; +} + + +/* write to disk directly instead of using buffer(Only Appends) */ +static int _vio_apnd2_write(CDBVIO *vio, int fd, void *buf, uint32_t size, bool aligned) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + uint32_t off; + + if (size == 0) + return 0; + + off = lseek(fd, 0, SEEK_END); + if (aligned) + off = OFFALIGNED(off); + if (pwrite(fd, buf, size, off) != size) { + /* to avoid compile warning */ + if (ftruncate(myio->ibuf.fd, off) < 0) ; + cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__); + return -1; + } + + return size; +} + + +/* read from disk; if data has not been written, read from buffer */ +static int _vio_apnd2_read(CDBVIO *vio, int fd, void *buf, uint32_t size, uint64_t off) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + int ret; + + /* in buffer? 
*/ + if (fd == myio->dbuf.fd && off >= myio->dbuf.off) { + uint64_t boff = off - myio->dbuf.off; + ret = CDBMIN(size, myio->dbuf.pos - boff); + memcpy(buf, myio->dbuf.buf + boff, ret); + } else if (fd == myio->ibuf.fd && off >= myio->ibuf.off) { + uint64_t boff = off - myio->ibuf.off; + ret = CDBMIN(size, myio->ibuf.pos - boff); + memcpy(buf, myio->ibuf.buf + boff, ret); + } else { + /* not in buffer */ + ret = pread(fd, buf, size, off); + if (ret < 0) { + cdb_seterrno(vio->db, CDB_READERR, __FILE__, __LINE__); + return -1; + } + } + return ret; +} + + +/* write all files meta information into a file */ +static int _vio_apnd2_writemeta(CDBVIO *vio) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + char buf[FILEMETASIZE]; + char *hbuf; + int hbufsize; + int pos = 0; + + memset(buf, 'X', FILEMETASIZE); + memcpy(buf, FILEMAGICHEADER, FILEMAGICLEN); + pos += FILEMAGICLEN; + cdb_lock_lock(myio->lock); + *(uint32_t*)(buf + pos) = myio->ibuf.off; + pos += SI4; + *(uint32_t*)(buf + pos) = myio->ibuf.limit; + pos += SI4; + *(uint32_t*)(buf + pos) = myio->dbuf.off; + pos += SI4; + *(uint32_t*)(buf + pos) = myio->dbuf.limit; + pos += SI4; + *(uint32_t*)(buf + pos) = myio->ifnum; + pos += SI4; + *(uint32_t*)(buf + pos) = myio->dfnum; + pos += SI4; + *(uint32_t*)(buf + pos) = myio->ibuf.fid; + pos += SI4; + *(uint32_t*)(buf + pos) = myio->dbuf.fid; + pos += SI4; + + hbufsize = (SI4 + SI4 + SI4 + SI8 + SI8 + 1 + 1) * myio->ifnum; + hbufsize += (SI4 + SI4 + SI4 + SI4 + SI8 + SI8 + 1 + 1) * myio->dfnum; + hbuf = (char*)malloc(hbufsize); + memset(hbuf, 'X', hbufsize); + pos = 0; + /* iterate all the index files order by oid */ + VIOAPND2FINFO *finfo = myio->idxfhead; + while(finfo != NULL) { + *(uint32_t*)(hbuf + pos) = finfo->fid; + pos += 4; + *(uint32_t*)(hbuf + pos) = finfo->fsize; + pos += 4; + *(uint32_t*)(hbuf + pos) = finfo->rcyled; + pos += 4; + *(uint64_t*)(hbuf + pos) = finfo->oidf; + pos += 8; + *(uint64_t*)(hbuf + pos) = finfo->oidl; + pos += 8; + *(uint8_t*)(hbuf + 
pos) = finfo->fstatus; + pos += 1; + *(uint8_t*)(hbuf + pos) = finfo->ftype; + pos += 1; + finfo = finfo->fnext; + } + + /* iterate all the data files order by oid */ + finfo = myio->datfhead; + while(finfo != NULL) { + *(uint32_t*)(hbuf + pos) = finfo->fid; + pos += 4; + *(uint32_t*)(hbuf + pos) = finfo->fsize; + pos += 4; + *(uint32_t*)(hbuf + pos) = finfo->rcyled; + pos += 4; + *(uint32_t*)(hbuf + pos) = finfo->nexpire; + pos += 4; + *(uint64_t*)(hbuf + pos) = finfo->oidf; + pos += 8; + *(uint64_t*)(hbuf + pos) = finfo->oidl; + pos += 8; + *(uint8_t*)(hbuf + pos) = finfo->fstatus; + pos += 1; + *(uint8_t*)(hbuf + pos) = finfo->ftype; + pos += 1; + finfo = finfo->fnext; + } + cdb_lock_unlock(myio->lock); + + if (pwrite(myio->mfd, buf, FILEMETASIZE, 0) != FILEMETASIZE) { + cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__); + free(hbuf); + return -1; + } + + if (pwrite(myio->mfd, hbuf, hbufsize, FILEMETASIZE) != hbufsize) { + cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__); + free(hbuf); + return -1; + } + free(hbuf); + + return 0; +} + + +/* close db */ +static int _vio_apnd2_close(CDBVIO *vio) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + CDBHTITEM *item; + char filename[MAX_PATH_LEN] = {0}; + VIOAPND2FINFO *finfo; + + /* flush buffer */ + _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX); + finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->idxmeta, &myio->ibuf.fid, SI4, false); + if (finfo) + _vio_apnd2_writefmeta(vio, myio->ibuf.fd, finfo); + _vio_apnd2_flushbuf(vio, VIOAPND2_DATA); + finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &myio->dbuf.fid, SI4, false); + if (finfo) + _vio_apnd2_writefmeta(vio, myio->dbuf.fd, finfo); + + /* iterate and close the fd cache */ + item = cdb_ht_iterbegin(myio->fdcache); + while(item != NULL) { + close(*(int*)cdb_ht_itemval(myio->fdcache, item)); + item = cdb_ht_iternext(myio->fdcache, item); + } + + if (myio->dbuf.fd > 0) + close(myio->dbuf.fd); + if (myio->ibuf.fd > 0) + close(myio->ibuf.fd); + + /* rewrite the 
metafile */ + _vio_apnd2_writemeta(vio); + /* close all open files */ + snprintf(filename, MAX_PATH_LEN, "%s/pid.cdb", myio->filepath); + unlink(filename); + /* dellog only be useful for recovery of database unsafety close */ + snprintf(filename, MAX_PATH_LEN, "%s/dellog.cdb", myio->filepath); + unlink(filename); + _vio_apnd2_setopensig(vio, VIOAPND2_SIGCLOSED); + if (myio->hfd > 0) + close(myio->hfd); + if (myio->mfd > 0) + close(myio->mfd); + if (myio->dfd > 0) + close(myio->dfd); + _vio_apnd2_destroy(vio); + return 0; +} + + +/* open a file, and remember its fd. The function runs under lock protection */ +static int _vio_apnd2_loadfd(CDBVIO *vio, uint32_t fid, int dtype) +{ + VIOAPND2 *myio = (VIOAPND2*)vio->iometa; + int fd; + char filename[MAX_PATH_LEN]; + char ipfx[] = "idx"; + char dpfx[] = "dat"; + char *pfx; + uint32_t vfid; + + if (dtype == VIOAPND2_INDEX) { + pfx = ipfx; + vfid = VFIDIDX(fid); + } else if (dtype == VIOAPND2_DATA) { + pfx = dpfx; + vfid = VFIDDAT(fid); + } else { + cdb_seterrno(vio->db, CDB_INTERNALERR, __FILE__, __LINE__); + return -1; + } + + snprintf(filename, MAX_PATH_LEN, "%s/%s%08d.cdb", myio->filepath, pfx, fid); + fd = open(filename, O_RDONLY, 0644); + if (fd < 0) { + cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__); + return -1; + } + + /* cache the fd, close the oldest file not touched */ + cdb_ht_insert2(myio->fdcache, &vfid, SI4, &fd, sizeof(int)); + while(myio->fdcache->num > myio->maxfds) { + CDBHTITEM *item = cdb_ht_poptail(myio->fdcache); + close(*(int*)cdb_ht_itemval(myio->fdcache, item)); + free(item); + } + + return fd; +} + +/* read a index page */ +static int _vio_apnd2_readpage(CDBVIO *vio, CDBPAGE **page, FOFF off) +{ + VIOAPND2 *myio = (VIOAPND2*)vio->iometa; + int ret, fd; + uint32_t psize; + uint32_t fid, roff; + uint32_t fixbufsize = SBUFSIZE - (sizeof(CDBPAGE) - PAGEHSIZE); + uint32_t areadsize = PAGEAREADSIZE; //vio->db->areadsize; + + VOFF2ROFF(off, fid, roff); + /* avoid dirty memory */ + 
(*page)->magic = 0; + + cdb_lock_lock(myio->lock); + if (fid == myio->ibuf.fid) + /* read from current writing file? */ + fd = myio->ibuf.fd; + else { + /* old index file */ + int vfid, *fdret; + vfid = VFIDIDX(fid); + /* in cache? */ + fdret = cdb_ht_get2(myio->fdcache, &vfid, sizeof(vfid), true); + if (fdret == NULL) { + fd = _vio_apnd2_loadfd(vio, fid, VIOAPND2_INDEX); + if (fd < 0) { + cdb_lock_unlock(myio->lock); + return -1; + } + } else + fd = *fdret; + } + + /* NOTICE: the data on disk actually starts at 'magic' field in structure */ + ret = _vio_apnd2_read(vio, fd, &(*page)->magic, areadsize, roff); + if (ret <= 0) { + cdb_lock_unlock(myio->lock); + return -1; + } + + if ((*page)->magic != PAGEMAGIC) { + cdb_lock_unlock(myio->lock); + cdb_seterrno(vio->db, CDB_DATAERRIDX, __FILE__, __LINE__); + return -1; + } + + psize = PAGESIZE(*page); + if (ret < areadsize && ret < psize) { + cdb_lock_unlock(myio->lock); + cdb_seterrno(vio->db, CDB_DATAERRIDX, __FILE__, __LINE__); + return ret; + } else if (psize > areadsize) { + /* need another read operation since the page is a large than default read size */ + if (psize > fixbufsize) { + /* record is larger the stack size */ + CDBPAGE *npage = (CDBPAGE *)malloc(sizeof(CDBPAGE) + (*page)->num * sizeof(PITEM)); + memcpy(&npage->magic, &(*page)->magic, areadsize); + *page = npage; + } + + ret = _vio_apnd2_read(vio, fd, (char*)&(*page)->magic + areadsize, + psize - areadsize, roff + areadsize); + if (ret < psize - areadsize) { + cdb_lock_unlock(myio->lock); + cdb_seterrno(vio->db, CDB_DATAERRIDX, __FILE__, __LINE__); + return -1; + } + } + + cdb_lock_unlock(myio->lock); + + /* remember where i got the page, calculate into junk space if page is discarded */ + (*page)->osize = OFFALIGNED(psize); + (*page)->ooff = off; + (*page)->cap = (*page)->num; + return 0; +} + +/* read a data record */ +static int _vio_apnd2_readrec(CDBVIO *vio, CDBREC** rec, FOFF off, bool readval) +{ + VIOAPND2 *myio = (VIOAPND2*)vio->iometa; + int 
ret, fd; + uint32_t rsize; + uint32_t fid, roff; + /* the 'rec' is hoped to be fit in stack, the actually size is a little smaller */ + /* because some fields in CDBREC structure are not on disk */ + uint32_t fixbufsize = SBUFSIZE - (sizeof(CDBREC) - RECHSIZE); + uint32_t areadsize = vio->db->areadsize; + + VOFF2ROFF(off, fid, roff); + /* avoid dirty memory */ + (*rec)->magic = 0; + + cdb_lock_lock(myio->lock); + if (fid == myio->dbuf.fid) + /* read from current writing file? */ + fd = myio->dbuf.fd; + else { + /* read from old data file */ + int vfid, *fdret; + vfid = VFIDDAT(fid); + fdret = cdb_ht_get2(myio->fdcache, &vfid, sizeof(vfid), true); + if (fdret == NULL) { + fd = _vio_apnd2_loadfd(vio, fid, VIOAPND2_DATA); + if (fd < 0) { + cdb_lock_unlock(myio->lock); + return -1; + } + } else + fd = *fdret; + } + + /* NOTICE: the data on disk actually starts at 'magic' field in structure */ + ret = _vio_apnd2_read(vio, fd, &(*rec)->magic, areadsize, roff); + if (ret <= 0) { + cdb_lock_unlock(myio->lock); + return -1; + } + + if ((*rec)->magic != RECMAGIC) { + cdb_lock_unlock(myio->lock); + cdb_seterrno(vio->db, CDB_DATAERRDAT, __FILE__, __LINE__); + return -1; + } + + uint32_t ovsize = (*rec)->vsize; + if (!readval) + /* read key only */ + (*rec)->vsize = 0; + rsize = RECSIZE(*rec); + + if (ret < areadsize && ret < rsize) { + cdb_lock_unlock(myio->lock); + cdb_seterrno(vio->db, CDB_DATAERRDAT, __FILE__, __LINE__); + return -1; + } else if (rsize > areadsize) { + /* need another read */ + if (rsize > fixbufsize) { + /* record is larger the stack size */ + CDBREC *nrec = (CDBREC *)malloc(sizeof(CDBREC)+(*rec)->ksize+(*rec)->vsize); + memcpy(&nrec->magic, &(*rec)->magic, areadsize); + *rec = nrec; + } + ret = _vio_apnd2_read(vio, fd, (char*)&(*rec)->magic + areadsize, + rsize - areadsize, roff + areadsize); + if (ret != rsize - areadsize) { + cdb_lock_unlock(myio->lock); + cdb_seterrno(vio->db, CDB_DATAERRDAT, __FILE__, __LINE__); + return -1; + } + } + 
cdb_lock_unlock(myio->lock); + + /* fix pointer */ + (*rec)->key = (*rec)->buf; + (*rec)->val = (*rec)->buf + (*rec)->ksize; + + /* even if didn't read the value, still keep the complete (old) size */ + if (!readval) + (*rec)->osize = OFFALIGNED(rsize + ovsize); + else + (*rec)->osize = OFFALIGNED(rsize); + + (*rec)->ooff = off; + return 0; +} + + +/* write a index page, return the written virtual offset */ +static int _vio_apnd2_writepage(CDBVIO *vio, CDBPAGE *page, FOFF *off) +{ + VIOAPND2 *myio = (VIOAPND2*)vio->iometa; + VIOAPND2FINFO *finfo; + uint32_t psize = PAGESIZE(page); + uint32_t fid, roff; + uint32_t ofid; + + page->magic = PAGEMAGIC; + page->oid = cdb_genoid(vio->db); + + cdb_lock_lock(myio->lock); + /* buffer ready? */ + if (myio->ibuf.fd < 0) { + if (_vio_apnd2_shiftnew(vio, VIOAPND2_INDEX) < 0) { + cdb_lock_unlock(myio->lock); + return -1; + } + } + + /* if it was modified from existing page, remember the wasted space */ + if (OFFNOTNULL(page->ooff)) { + VOFF2ROFF(page->ooff, ofid, roff); + finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->idxmeta, &ofid, SI4, false); + if (finfo) + finfo->rcyled += page->osize; + } + + if (psize > myio->ibuf.limit) { + /* page too large */ + _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX); + fid = myio->ibuf.fid; + roff = myio->ibuf.off; + _vio_apnd2_write(vio, myio->ibuf.fd, &page->magic, psize, true); + myio->ibuf.oid = page->oid; + _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX); + cdb_lock_unlock(myio->lock); + + /* remember last wrote offset */ + ROFF2VOFF(fid, roff, *off); + page->ooff = *off; + page->osize = OFFALIGNED(psize); + return 0; + } else if (psize + myio->ibuf.pos > myio->ibuf.limit) + /* buffer is full */ + _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX); + + /* copy to buffer */ + fid = myio->ibuf.fid; + roff = myio->ibuf.off + myio->ibuf.pos; + memcpy(myio->ibuf.buf + myio->ibuf.pos, &page->magic, psize); + myio->ibuf.pos += psize; + myio->ibuf.pos = OFFALIGNED(myio->ibuf.pos); + myio->ibuf.oid = page->oid; + 
cdb_lock_unlock(myio->lock); + ROFF2VOFF(fid, roff, *off); + + /* remember last wrote offset */ + page->ooff = *off; + page->osize = OFFALIGNED(psize); + return 0; +} + + +/* delete a record */ +static int _vio_apnd2_deleterec(CDBVIO *vio, CDBREC *rec, FOFF off) +{ + VIOAPND2 *myio = (VIOAPND2*)vio->iometa; + uint32_t ofid, roff; + + cdb_lock_lock(myio->lock); + myio->delbuf[myio->delbufpos] = off; + if (++myio->delbufpos == DELBUFMAX) { + if (_vio_apnd2_flushbuf(vio, VIOAPND2_DELLOG) < 0) + return -1; + } + + /* it is an deleted record, remember the space to be recycled */ + VOFF2ROFF(off, ofid, roff); + if (OFFNOTNULL(rec->ooff)) { + VIOAPND2FINFO *finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &ofid, SI4, false); + if (finfo) { + finfo->rcyled += rec->osize; + } + } + cdb_lock_unlock(myio->lock); + return 0; +} + + + +/* write a data record, return the written virtual offset */ +static int _vio_apnd2_writerec(CDBVIO *vio, CDBREC *rec, FOFF *off, int ptrtype) { + VIOAPND2 *myio = (VIOAPND2*)vio->iometa; + uint32_t rsize = RECSIZE(rec); + uint32_t fid, roff, ofid; + if (ptrtype == VIOAPND2_RECEXTERNAL) + rec->magic = RECMAGIC; + + /* oid always are increment, even if it is a record moved from an old data file */ + rec->oid = cdb_genoid(vio->db); + cdb_lock_lock(myio->lock); + /* buffer ready? 
*/ + if (myio->dbuf.fd < 0) { + if (_vio_apnd2_shiftnew(vio, VIOAPND2_DATA) < 0) { + cdb_lock_unlock(myio->lock); + return -1; + } + } + /* it is an overwritten record, remember the space to be recycled */ + if (OFFNOTNULL(rec->ooff)) { + VOFF2ROFF(rec->ooff, ofid, roff); + VIOAPND2FINFO *finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &ofid, SI4, false); + if (finfo) + finfo->rcyled += rec->osize; + } + if (rsize > myio->dbuf.limit) { + /* record too large */ + _vio_apnd2_flushbuf(vio, VIOAPND2_DATA); + fid = myio->dbuf.fid; + roff = myio->dbuf.off; + _vio_apnd2_write(vio, myio->dbuf.fd, &rec->magic, RECHSIZE, true); + if (ptrtype == VIOAPND2_RECINTERNAL) + _vio_apnd2_write(vio, myio->dbuf.fd, rec->buf, rec->ksize + rec->vsize, false); + else { + _vio_apnd2_write(vio, myio->dbuf.fd, rec->key, rec->ksize, false); + _vio_apnd2_write(vio, myio->dbuf.fd, rec->val, rec->vsize, false); + } + /* reset the buffer */ + myio->dbuf.oid = rec->oid; + _vio_apnd2_flushbuf(vio, VIOAPND2_DATA); + if (rec->expire) { + VIOAPND2FINFO *finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &fid, SI4, false); + if (finfo) { + if (finfo->nexpire == 0) { + finfo->lcktime = time(NULL); + finfo->nexpire = rec->expire; + } else if (finfo->nexpire > rec->expire) { + finfo->nexpire = rec->expire; + } + } + } + cdb_lock_unlock(myio->lock); + ROFF2VOFF(fid, roff, *off); + return 0; + } else if (rsize + myio->dbuf.pos > myio->dbuf.limit) + /* buffer is full */ + _vio_apnd2_flushbuf(vio, VIOAPND2_DATA); + /* copy to buffer */ + fid = myio->dbuf.fid; + roff = myio->dbuf.off + myio->dbuf.pos; + memcpy(myio->dbuf.buf + myio->dbuf.pos, &rec->magic, RECHSIZE); + myio->dbuf.pos += RECHSIZE; + if (ptrtype == VIOAPND2_RECINTERNAL) { + memcpy(myio->dbuf.buf + myio->dbuf.pos, rec->buf, rec->ksize + rec->vsize); + myio->dbuf.pos += rec->ksize + rec->vsize; + } else { + memcpy(myio->dbuf.buf + myio->dbuf.pos, rec->key, rec->ksize); + myio->dbuf.pos += rec->ksize; + memcpy(myio->dbuf.buf + myio->dbuf.pos, 
rec->val, rec->vsize); + myio->dbuf.pos += rec->vsize; + } + myio->dbuf.pos = OFFALIGNED(myio->dbuf.pos); + myio->dbuf.oid = rec->oid; + if (rec->expire) { + VIOAPND2FINFO *finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &fid, SI4, false); + if (finfo) { + if (finfo->nexpire == 0) { + finfo->lcktime = time(NULL); + finfo->nexpire = rec->expire; + } else if (finfo->nexpire > rec->expire) { + finfo->nexpire = rec->expire; + } + } + } + ROFF2VOFF(fid, roff, *off); + cdb_lock_unlock(myio->lock); + rec->osize = rsize; + rec->ooff = *off; + return 0; +} + +static int _vio_apnd2_writerecexternal(CDBVIO *vio, CDBREC *rec, FOFF *off) +{ + return _vio_apnd2_writerec(vio, rec, off, VIOAPND2_RECEXTERNAL); +} + +static int _vio_apnd2_writerecinternal(CDBVIO *vio, CDBREC *rec, FOFF *off) +{ + return _vio_apnd2_writerec(vio, rec, off, VIOAPND2_RECINTERNAL); +} + + +/* flush buffers, and sync data to disk from OS cache */ +static int _vio_apnd2_sync(CDBVIO *vio) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + cdb_lock_lock(myio->lock); + _vio_apnd2_flushbuf(vio, VIOAPND2_DATA); + _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX); + if (myio->dbuf.fd > 0) + fdatasync(myio->dbuf.fd); + if (myio->ibuf.fd > 0) + fdatasync(myio->ibuf.fd); + + _vio_apnd2_writehead(vio, false); + cdb_lock_unlock(myio->lock); + return 0; +} + + +/* write db information and main index table into a single file */ +static int _vio_apnd2_writehead(CDBVIO *vio, bool wtable) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + CDB *db = vio->db; + char buf[FILEMETASIZE]; + int pos = 0; + + memset(buf, 'X', FILEMETASIZE); + memcpy(buf, FILEMAGICHEADER, FILEMAGICLEN); + pos += FILEMAGICLEN; + *(uint32_t*)(buf + pos) = db->hsize; + pos += SI4; + *(uint64_t*)(buf + pos) = db->oid; + pos += SI8; + *(uint64_t*)(buf + pos) = db->roid; + pos += SI8; + *(uint64_t*)(buf + pos) = db->rnum; + pos += SI8; + *(uint32_t*)(buf + pos) = VIOAPND2_SIGOPEN; + pos += SI4; + + if (pwrite(myio->hfd, buf, FILEMETASIZE, 0) != FILEMETASIZE) { + 
cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__); + return -1; + } + + if (wtable && pwrite(myio->hfd, db->mtable, sizeof(FOFF) * db->hsize, FILEMETASIZE) + != sizeof(FOFF) * db->hsize) { + cdb_seterrno(vio->db, CDB_WRITEERR, __FILE__, __LINE__); + return -1; + } + return 0; +} + + +/* wrapped for upper layer */ +static int _vio_apnd2_writehead2(CDBVIO *vio) +{ + return _vio_apnd2_writehead(vio, true); +} + + +/* read db information and main index table from a single file */ +static int _vio_apnd2_readhead(CDBVIO *vio, bool rtable) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + CDB *db = vio->db; + char buf[FILEMETASIZE]; + int pos = 0; + + if (myio->create) { + /* the db is just created, allocate a empty main index table for db */ + db->mtable = (FOFF *)malloc(sizeof(FOFF) * db->hsize); + memset(db->mtable, 0, sizeof(FOFF) * db->hsize); + _vio_apnd2_writehead(vio, false); + return 0; + } + + if (pread(myio->hfd, buf, FILEMETASIZE, 0) != FILEMETASIZE) { + cdb_seterrno(db, CDB_READERR, __FILE__, __LINE__); + return -1; + } + + if (memcmp(buf, FILEMAGICHEADER, FILEMAGICLEN)) { + cdb_seterrno(db, CDB_DATAERRMETA, __FILE__, __LINE__); + return -1; + } + + pos += FILEMAGICLEN; + db->hsize = *(uint32_t*)(buf + pos); + pos += SI4; + db->oid = *(uint64_t*)(buf + pos); + pos += SI8; + db->roid = *(uint64_t*)(buf + pos); + pos += SI8; + db->rnum = *(uint64_t*)(buf + pos); + pos += SI8; + /* 4 bytes reserved for open status */ + pos += SI4; + + if (!rtable) + return 0; + + if (db->mtable) + free(db->mtable); + db->mtable = (FOFF *)malloc(sizeof(FOFF) * db->hsize); + if (pread(myio->hfd, db->mtable, sizeof(FOFF) * db->hsize, FILEMETASIZE) != + sizeof(FOFF) * db->hsize) { + free(db->mtable); + cdb_seterrno(db, CDB_READERR, __FILE__, __LINE__); + return -1; + } + return 0; +} + + +/* wrapped for upper layer */ +static int _vio_apnd2_readhead2(CDBVIO *vio) +{ + return _vio_apnd2_readhead(vio, true); +} + + +/* check if some dat file has too large junk space */ +static 
void _vio_apnd2_rcyledataspacetask(void *arg) +{ + CDBVIO *vio = (CDBVIO *)arg; + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + CDBHTITEM *item; + uint32_t now = time(NULL); + uint32_t posblexpnum = 0; + cdb_lock_lock(myio->lock); + item = cdb_ht_iterbegin(myio->datmeta); + while(item != NULL) { + VIOAPND2FINFO *finfo = (VIOAPND2FINFO*)cdb_ht_itemval(myio->datmeta, item); + if (finfo->nexpire && finfo->nexpire <= now) + posblexpnum++; + item = cdb_ht_iternext(myio->datmeta, item); + } + + item = cdb_ht_iterbegin(myio->datmeta); + while(item != NULL) { + VIOAPND2FINFO *finfo = (VIOAPND2FINFO*)cdb_ht_itemval(myio->datmeta, item); + uint32_t fid = finfo->fid; + /* rcyled space size is inaccurate */ + if (finfo->rcyled * 2 < finfo->fsize + /* no data file possibly has expire record */ + && (posblexpnum == 0 + /* long enough time passed since last check on this file */ + || finfo->lcktime + posblexpnum * DATARCYLECHECKFACTOR > now + /* check the data file most recent expire record */ + || finfo->nexpire > now + /* no expire record */ + || finfo->nexpire == 0)) { + item = cdb_ht_iternext(myio->datmeta, item); + continue; + } + + /* do not work on the writing file or file to be deleted */ + if (finfo->fstatus != VIOAPND2_FULL || finfo->unlink) { + item = cdb_ht_iternext(myio->datmeta, item); + continue; + } + + /* have to iterate and calculate recycle space */ + finfo->ref++; + /* operation on this file should not in lock protection */ + cdb_lock_unlock(myio->lock); + + if (finfo->rcyled * 2 < finfo->fsize) { + _vio_apnd2_rcyledatafile(vio, finfo, false); + finfo->lcktime = now; + } + + if (finfo->rcyled * 2 >= finfo->fsize) { + _vio_apnd2_rcyledatafile(vio, finfo, true); + } + + cdb_lock_lock(myio->lock); + finfo->ref--; + if (finfo->ref == 0 && finfo->unlink) { + /* unlink the file */ + _vio_apnd2_unlink(vio, finfo, VIOAPND2_DATA); + cdb_ht_del2(myio->datmeta, &fid, SI4); + } + item = cdb_ht_iterbegin(myio->datmeta); + } + cdb_lock_unlock(myio->lock); +} + +/* only be 
called in _vio_apnd2_rcylepagespacetask; when a page is moved into a new + index file, its ooff should be changed, also its copy in cache should be updated */ +static void _vio_apnd2_fixcachepageooff(CDB *db, uint32_t bid, FOFF off) +{ + CDBPAGE *page = NULL; + + if (db->pcache) { + cdb_lock_lock(db->pclock); + page = cdb_ht_get2(db->pcache, &bid, SI4, true); + cdb_lock_unlock(db->pclock); + } + + /* not in pcache, exists in dirty page cache? */ + if (page == NULL && db->dpcache) { + cdb_lock_lock(db->dpclock); + page = cdb_ht_get2(db->dpcache, &bid, SI4, true); + cdb_lock_unlock(db->dpclock); + } + + if (page) + page->ooff = off; +} + +/* check if some index file has too large junk space */ +static void _vio_apnd2_rcylepagespacetask(void *arg) +{ + CDBVIO *vio = (CDBVIO *)arg; + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + CDBHTITEM *item; + + cdb_lock_lock(myio->lock); + item = cdb_ht_iterbegin(myio->idxmeta); + while(item != NULL) { + VIOAPND2FINFO *finfo = (VIOAPND2FINFO*)cdb_ht_itemval(myio->idxmeta, item); + uint32_t fid = finfo->fid; + + /* do not work on the writing file or file to be deleted */ + if (finfo->fstatus != VIOAPND2_FULL || finfo->unlink) { + item = cdb_ht_iternext(myio->idxmeta, item); + continue; + } + + /* junk space too large? 
*/ + if (finfo->rcyled * 2 > finfo->fsize) { + int fd; + char filename[MAX_PATH_LEN]; + snprintf(filename, MAX_PATH_LEN, "%s/idx%08d.cdb", myio->filepath, fid); + fd = open(filename, O_RDONLY, 0644); + if (fd < 0) { + cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__); + item = cdb_ht_iternext(myio->idxmeta, item); + continue; + } + finfo->ref++; + /* I/O should not block the lock */ + cdb_lock_unlock(myio->lock); + + uint32_t fsize = lseek(fd, 0, SEEK_END); + uint32_t pos = FILEMETASIZE; + char *map = mmap(NULL, fsize, PROT_READ, MAP_PRIVATE, fd, 0); + while(pos < fsize) { + CDBPAGE *page = (CDBPAGE *)&map[pos-(sizeof(CDBPAGE) - PAGEHSIZE)]; + FOFF off; + + if (page->magic != PAGEMAGIC) { + pos += ALIGNBYTES; + continue; + } + + ROFF2VOFF(fid, pos, off); + page->ooff = off; + page->osize = OFFALIGNED(PAGESIZE(page)); + if (OFFEQ(vio->db->mtable[page->bid], off)) { + FOFF noff; + _vio_apnd2_writepage(vio, page, &noff); + /* lock and double check */ + cdb_lock_lock(vio->db->mlock[page->bid % MLOCKNUM]); + if (OFFEQ(vio->db->mtable[page->bid], off)) { + vio->db->mtable[page->bid] = noff; + _vio_apnd2_fixcachepageooff(vio->db, page->bid, noff); + } + cdb_lock_unlock(vio->db->mlock[page->bid % MLOCKNUM]); + } + pos += OFFALIGNED(PAGESIZE(page)); + } + munmap(map, fsize); + close(fd); + + cdb_lock_lock(myio->lock); + /* drop information for the file */ + finfo->ref--; + finfo->unlink = true; + if (finfo->ref == 0) { + /* unlink the file */ + _vio_apnd2_unlink(vio, finfo, VIOAPND2_INDEX); + cdb_ht_del2(myio->idxmeta, &fid, SI4); + } + /* reset the iterator */ + item = cdb_ht_iterbegin(myio->idxmeta); + continue; + } + item = cdb_ht_iternext(myio->idxmeta, item); + } + cdb_lock_unlock(myio->lock); +} + + +/* unlink a file and remove fd from fdcache. 
The function runs under lock protection */ +static void _vio_apnd2_unlink(CDBVIO *vio, VIOAPND2FINFO *finfo, int dtype) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + char filename[MAX_PATH_LEN]; + char ipfx[] = "idx"; + char dpfx[] = "dat"; + char *pfx; + uint32_t *fnum; + uint32_t vfid, fid = finfo->fid; + VIOAPND2FINFO **fhead, **ftail; + CDBHTITEM *fditem = NULL; + + if (dtype == VIOAPND2_INDEX) { + pfx = ipfx; + vfid = VFIDIDX(fid); + fnum = &myio->ifnum; + fhead = &myio->idxfhead; + ftail = &myio->idxftail; + } else if (dtype == VIOAPND2_DATA) { + pfx = dpfx; + vfid = VFIDDAT(fid); + fnum = &myio->dfnum; + fhead = &myio->datfhead; + ftail = &myio->datftail; + } else + return; + + snprintf(filename, MAX_PATH_LEN, "%s/%s%08d.cdb", myio->filepath, pfx, fid); + fditem = cdb_ht_del(myio->fdcache, &vfid, SI4); + if (fditem != NULL) { + close(*(int*)cdb_ht_itemval(myio->fdcache, fditem)); + free(fditem); + } + (*fnum)--; + unlink(filename); + + /* fix linked list of data/index files after remove a finfo from meta table */ + if (finfo->fprev) + finfo->fprev->fnext = finfo->fnext; + if (finfo->fnext) + finfo->fnext->fprev = finfo->fprev; + if (*fhead == finfo) + *fhead = finfo->fnext; + if (*ftail == finfo) + *ftail = finfo->fprev; +} + + +/* only be used for sorting files at recovery */ +typedef struct { + uint32_t fid; + uint64_t oidf; +} VIOAPND2SREORDER; + + +static int _vio_apnd2_cmpfuncsreorder(const void *p1, const void *p2) +{ + VIOAPND2SREORDER *s1, *s2; + s1 = (VIOAPND2SREORDER *)p1; + s2 = (VIOAPND2SREORDER *)p2; + return s1->oidf - s2->oidf; +} + + +/* recovery the database if it was not close properly + * or force recovery from roid = 0 + * the procedure runs with no lock protection */ +static int _vio_apnd2_recovery(CDBVIO *vio, bool force) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + CDB *db = vio->db; + char filename[MAX_PATH_LEN]; + struct dirent *filelist; + VIOAPND2SREORDER *idxorders; + int idxpos, idxlimit; + VIOAPND2SREORDER *datorders; + 
int datpos, datlimit; + uint32_t imaxfid = 0, dmaxfid = 0; + bool gotmindex = false; + + + idxpos = datpos = 0; + idxlimit = datlimit = 256; + idxorders = (VIOAPND2SREORDER *)malloc(idxlimit * sizeof(VIOAPND2SREORDER)); + datorders = (VIOAPND2SREORDER *)malloc(datlimit * sizeof(VIOAPND2SREORDER)); + DIR *dir = opendir(myio->filepath); + myio->dfnum = myio->ifnum = 0; + myio->datfhead = myio->datftail = myio->idxfhead = myio->idxftail = NULL; + /* special value to mark if found current writing file */ + myio->ibuf.fid = myio->dbuf.fid = -1; + for (filelist = readdir(dir); filelist; filelist = readdir(dir)) { + // Check file name/type + const char *cstr = filelist->d_name; + if (strncmp(cstr + strlen(cstr) - 4, ".cdb", 4) != 0) + /* not a cuttdb file*/ + continue; + if (strcmp(cstr, "dellog.cdb") == 0) { + snprintf(filename, MAX_PATH_LEN, "%s/%s", myio->filepath, cstr); + myio->dfd = open(filename, O_RDONLY, 0644); + } else if (strcmp(cstr, "mainindex.cdb") == 0) { + gotmindex = true; +// snprintf(filename, MAX_PATH_LEN, "%s/%s", myio->filepath, cstr); +// myio->hfd = open(filename, O_RDONLY, 0644); +// if (_vio_apnd2_readhead(vio, false) < 0 || db->hsize == 0) { +// goto ERRRET; +// } +// db->mtable = (FOFF *)malloc(sizeof(FOFF) * db->hsize); +// gotmindex = true; +// memset(db->mtable, 0, sizeof(FOFF) * db->hsize); + } else if (strcmp(cstr, "mainmeta.cdb") == 0) { + snprintf(filename, MAX_PATH_LEN, "%s/%s", myio->filepath, cstr); + myio->mfd = open(filename, O_RDWR, 0644); + if (myio->mfd < 0) { + cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__); + continue; + } + } else if (strlen(cstr) == 15 + && (strncmp(cstr, "dat", 3) == 0 || strncmp(cstr, "idx", 3) == 0)) { + VIOAPND2FINFO finfo; + uint64_t fsize = 0; + + snprintf(filename, MAX_PATH_LEN, "%s/%s", myio->filepath, cstr); + int fd = open(filename, O_RDWR, 0644); + if (fd < 0) { + cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__); + continue; + } + if (_vio_apnd2_readfmeta(vio, fd, &finfo) < 0) { + 
close(fd); + continue; + } + fsize = lseek(fd, 0, SEEK_END); + finfo.rcyled = 0; + finfo.ref = 0; + finfo.unlink = false; + finfo.fprev = finfo.fnext = NULL; + if (finfo.ftype == VIOAPND2_INDEX) { + if (force) { + /* delete all index file and rebuild them if force to recovery */ + close(fd); + unlink(filename); + } else { + cdb_ht_insert2(myio->idxmeta, &finfo.fid, SI4, &finfo, sizeof(VIOAPND2FINFO)); + idxorders[idxpos].fid = finfo.fid; + idxorders[idxpos].oidf = finfo.oidf; + if (++idxpos == idxlimit) { + VIOAPND2SREORDER *tmp = (VIOAPND2SREORDER *)malloc(idxlimit * 2 * sizeof(VIOAPND2SREORDER)); + memcpy(tmp, idxorders, idxlimit * sizeof(VIOAPND2SREORDER)); + idxlimit *= 2; + free(idxorders); + idxorders = tmp; + } + if(finfo.fstatus == VIOAPND2_WRITING) { + myio->ibuf.fid = finfo.fid; + myio->ibuf.off = OFFALIGNED(fsize); + myio->ibuf.pos = 0; + myio->ibuf.fd = fd; + } else + close(fd); + if (finfo.fid > imaxfid) + imaxfid = finfo.fid; + myio->ifnum++; + } + } else if (finfo.ftype == VIOAPND2_DATA) { + /* no information about nearest expire record time, make a fake one(non zero) */ + finfo.nexpire = finfo.lcktime = time(NULL); + cdb_ht_insert2(myio->datmeta, &finfo.fid, SI4, &finfo, sizeof(VIOAPND2FINFO)); + datorders[datpos].fid = finfo.fid; + datorders[datpos].oidf = finfo.oidf; + if (++datpos == datlimit) { + VIOAPND2SREORDER *tmp = (VIOAPND2SREORDER *)malloc(datlimit * 2 * sizeof(VIOAPND2SREORDER)); + memcpy(tmp, datorders, datlimit * sizeof(VIOAPND2SREORDER)); + datlimit *= 2; + free(datorders); + datorders = tmp; + } + if (finfo.fstatus == VIOAPND2_WRITING) { + myio->dbuf.fid = finfo.fid; + myio->dbuf.off = OFFALIGNED(fsize); + myio->dbuf.pos = 0; + myio->dbuf.fd = fd; + } else + close(fd); + if (finfo.fid > dmaxfid) + dmaxfid = finfo.fid; + myio->dfnum++; + } else + close(fd); + } /* end of else */ + } /* end of for */ + + + /* fix recycled size */ + _vio_apnd2_readmeta(vio, true); + closedir(dir); + + if (!gotmindex) { + /* recovery failed */ + /* 
return */ + goto ERRRET; + } else { + if (_vio_apnd2_readhead(vio, false) < 0) + goto ERRRET; + } + + if (myio->mfd < 0) { + snprintf(filename, MAX_PATH_LEN, "%s/mainmeta.cdb", myio->filepath); + myio->mfd = open(filename, O_RDWR | O_CREAT, 0644); + if (myio->mfd < 0) { + cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__); + goto ERRRET; + } + } + + /* index file complele broken, replay all records to build the index */ + if (myio->ifnum == 0 || force) + db->roid = 0; + /* re-count records num */ + db->rnum = 0; + + /* fix index/data file meta relation */ + qsort(datorders, datpos, sizeof(VIOAPND2SREORDER), _vio_apnd2_cmpfuncsreorder); + qsort(idxorders, idxpos, sizeof(VIOAPND2SREORDER), _vio_apnd2_cmpfuncsreorder); + + VIOAPND2FINFO *lfinfo = NULL; + for(int i = 0; i < datpos; i++) { + VIOAPND2FINFO *cfinfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &datorders[i].fid, SI4, false); + if (cfinfo == NULL) + continue; + if (lfinfo) + lfinfo->fnext = cfinfo; + else { + myio->datfhead = cfinfo; + } + cfinfo->fprev = lfinfo; + lfinfo = cfinfo; + } + myio->datftail = lfinfo; + if (lfinfo) + lfinfo->fnext = NULL; + lfinfo = NULL; + for(int i = 0; i < idxpos; i++) { + VIOAPND2FINFO *cfinfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->idxmeta, &idxorders[i].fid, SI4, false); + if (cfinfo == NULL) + continue; + if (lfinfo) + lfinfo->fnext = cfinfo; + else { + myio->idxfhead = cfinfo; + } + cfinfo->fprev = lfinfo; + lfinfo = cfinfo; + } + myio->idxftail = lfinfo; + if (lfinfo) + lfinfo->fnext = NULL; + lfinfo = NULL; + + if (myio->ibuf.fid == -1) { + myio->ibuf.fid = 0; + _vio_apnd2_shiftnew(vio, VIOAPND2_INDEX); + } + if (myio->dbuf.fid == -1) { + myio->dbuf.fid = 0; + _vio_apnd2_shiftnew(vio, VIOAPND2_DATA); + } + + /* fix offsets in main index table */ + db->mtable = (FOFF *)malloc(db->hsize * sizeof(FOFF)); + memset(db->mtable, 0, db->hsize * sizeof(FOFF)); + void *it = _vio_apnd2_pageiterfirst(vio, 0); + if (it) { + char sbuf[SBUFSIZE]; + CDBPAGE *page = (CDBPAGE *)sbuf; + 
/* need not use iterator since don't care about contents in page */ + /* I'm just lazy, cpu time is cheap */ + while(_vio_apnd2_pageiternext(vio, &page, it) == 0) { + if (OFFNOTNULL(db->mtable[page->bid])) { + /* recalculate the space to be recycled */ + uint32_t ofid, roff; + char sbuf[SBUFSIZE]; + CDBPAGE *opage = (CDBPAGE *)sbuf; + _vio_apnd2_readpage(vio, &opage, db->mtable[page->bid]); + if (OFFNOTNULL(opage->ooff)) { + VOFF2ROFF(opage->ooff, ofid, roff); + VIOAPND2FINFO *finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->idxmeta, &ofid, SI4, false); + if (finfo) + finfo->rcyled += opage->osize; + } + /* fix impaction of old page */ + db->rnum -= opage->num; + if (opage != (CDBPAGE *)sbuf) + free(opage); + } + db->mtable[page->bid] = page->ooff; + db->rnum += page->num; + if (page != (CDBPAGE *)sbuf) { + free(page); + page = (CDBPAGE *)sbuf; + } + } + _vio_apnd2_pageiterdestory(vio, it); + } + + /* like what was did just now */ + it = _vio_apnd2_reciterfirst(vio, db->roid); + if (it) { + char sbuf[SBUFSIZE]; + CDBREC *rec = (CDBREC *)sbuf; + while(_vio_apnd2_reciternext(vio, &rec, it) == 0) { + FOFF soffs[SFOFFNUM]; + FOFF *soff = soffs, ooff; + char sbuf2[SBUFSIZE]; + OFFZERO(ooff); + CDBREC *rrec = (CDBREC*)sbuf2; + uint64_t hash = CDBHASH64(rec->buf, rec->ksize); + + /* check record with duplicate key(old version/overwritten maybe */ + int retnum = cdb_getoff(db, hash, &soff, CDB_NOTLOCKED); + for(int i = 0; i < retnum; i++) { + if (rrec != (CDBREC*)sbuf2) { + free(rrec); + rrec = (CDBREC*)sbuf2; + } + + int cret = _vio_apnd2_readrec(db->vio, &rrec, soff[i], false); + if (cret < 0) + continue; + + if (rec->ksize == rrec->ksize && memcmp(rrec->key, rec->key, rec->ksize) == 0) { + ooff = rrec->ooff; + break; + } + } + if (soff != soffs) + free(soff); + if (rrec != (CDBREC*)sbuf2) + free(rrec); + + if (OFFNOTNULL(ooff)) + /* replace offset in index */ + cdb_replaceoff(db, hash, ooff, rec->ooff, CDB_NOTLOCKED); + else + cdb_updatepage(vio->db, hash, rec->ooff, 
CDB_PAGEINSERTOFF, CDB_NOTLOCKED); + + if (rec->oid > db->oid) + db->oid = rec->oid; + if (rec != (CDBREC *)sbuf) { + free(rec); + rec = (CDBREC *)sbuf; + } + } + _vio_apnd2_reciterdestory(vio, it); + } + + /* replay deletion logs */ + FOFF delitems[1024]; + for(; myio->dfd > 0;) { + int ret = read(myio->dfd, delitems, 1024 * sizeof(FOFF)); + if (ret > 0) { + for(int j = 0; j * sizeof(FOFF) < ret; j++) { + char sbuf[SBUFSIZE]; + uint32_t ofid, roff; + CDBREC *rec = (CDBREC *)sbuf; + if (_vio_apnd2_readrec(vio, &rec, delitems[j], false) < 0) + continue; + if (cdb_updatepage(db, CDBHASH64(rec->key, rec->ksize), + delitems[j], CDB_PAGEDELETEOFF, CDB_NOTLOCKED) == 0) + VOFF2ROFF(delitems[j], ofid, roff); + VIOAPND2FINFO *finfo = (VIOAPND2FINFO *)cdb_ht_get2(myio->datmeta, &ofid, SI4, false); + if (finfo) + finfo->rcyled += rec->osize; + if (rec != (CDBREC *)sbuf) + free(rec); + } + } else { + close(myio->dfd); + myio->dfd = -1; + } + } + + cdb_flushalldpage(db); + _vio_apnd2_writemeta(vio); + _vio_apnd2_writehead(vio, true); + cdb_ht_clean(myio->idxmeta); + cdb_ht_clean(myio->datmeta); + free(idxorders); + free(datorders); + /* mfd / dfd will be opened again after this function, but hfd won't be */ + myio->datfhead = myio->datftail = myio->idxfhead = myio->idxftail = NULL; + if (myio->ibuf.fd > 0) + close(myio->ibuf.fd); + if (myio->dbuf.fd > 0) + close(myio->dbuf.fd); + if (myio->mfd > 0) + close(myio->mfd); + if (myio->dfd > 0) + close(myio->dfd); + return 0; + +ERRRET: + closedir(dir); + if (myio->hfd > 0) + close(myio->hfd); + if (myio->mfd > 0) + close(myio->mfd); + if (myio->dfd > 0) + close(myio->dfd); + free(datorders); + free(idxorders); + return -1; +} + + +static VIOAPND2FINFO* _vio_apnd2_fileiternext(CDBVIO *vio, int dtype, uint64_t oid) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + uint64_t foid = (uint64_t)-1; + CDBHTITEM *item; + CDBHASHTABLE *ht; + VIOAPND2FINFO *finfo = NULL; + + if (dtype == VIOAPND2_INDEX) + ht = myio->idxmeta; + else if (dtype == 
VIOAPND2_DATA) + ht = myio->datmeta; + else + return NULL; + + cdb_lock_lock(myio->lock); + item = cdb_ht_iterbegin(ht); + while(item) { + VIOAPND2FINFO *tfinfo = (VIOAPND2FINFO *)cdb_ht_itemval(ht, item); + if (tfinfo->oidf < foid && tfinfo->oidf >= oid) { + foid = tfinfo->oidf; + finfo = tfinfo; + } + item = cdb_ht_iternext(ht, item); + } + if (finfo) + finfo->ref++; + cdb_lock_unlock(myio->lock); + return finfo; +} + +static int _vio_apnd2_iterfirst(CDBVIO *vio, VIOAPND2ITOR *it, int dtype, int64_t oid) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + CDBHASHTABLE *tmpcache; + char filename[MAX_PATH_LEN]; + char ipfx[] = "idx"; + char dpfx[] = "dat"; + char *pfx; + + if (dtype == VIOAPND2_INDEX) { + pfx = ipfx; + tmpcache = myio->idxmeta; + } else if (dtype == VIOAPND2_DATA) { + pfx = dpfx; + tmpcache = myio->datmeta; + } else + return -1; + + if (it->finfo == NULL) + it->finfo = _vio_apnd2_fileiternext(vio, dtype, oid); + if (it->finfo == NULL) { + return -1; + } + + snprintf(filename, MAX_PATH_LEN, "%s/%s%08d.cdb", myio->filepath, pfx, it->finfo->fid); + it->fd = open(filename, O_RDONLY, 0644); + if (it->fd < 0) { + cdb_lock_lock(myio->lock); + it->finfo->ref--; + if (it->finfo->ref == 0 && it->finfo->unlink) { + /* unlink the file */ + _vio_apnd2_unlink(vio, it->finfo, dtype); + cdb_ht_del2(tmpcache, &it->finfo->fid, SI4); + } + cdb_lock_unlock(myio->lock); + cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__); + return -1; + } + + it->fsize = lseek(it->fd, 0, SEEK_END); + it->mmap = mmap(NULL, it->fsize, PROT_READ, MAP_PRIVATE, it->fd, 0); + it->off = FILEMETASIZE; + it->oid = oid; + + while(it->off < it->fsize) { + if (dtype == VIOAPND2_INDEX) { + CDBPAGE *page = (CDBPAGE *)(it->mmap + it->off -(sizeof(CDBPAGE) - PAGEHSIZE)); + if (page->magic != PAGEMAGIC) { + it->off += ALIGNBYTES; + continue; + } + if (page->oid >= oid) + break; + it->off += OFFALIGNED(PAGESIZE(page)); + } else if (dtype == VIOAPND2_DATA) { + CDBREC *rec = (CDBREC *)(it->mmap + 
it->off -(sizeof(CDBREC) - RECHSIZE)); + if (rec->magic != RECMAGIC && rec->magic != DELRECMAGIC) { + it->off += ALIGNBYTES; + continue; + } + if (rec->oid >= oid) + break; + it->off += OFFALIGNED(RECSIZE(rec)); + } + } + + if (it->off >= it->fsize) { + munmap(it->mmap, it->fsize); + close(it->fd); + cdb_lock_lock(myio->lock); + it->finfo->ref--; + if (it->finfo->ref == 0 && it->finfo->unlink) { + /* unlink the file */ + _vio_apnd2_unlink(vio, it->finfo, dtype); + cdb_ht_del2(tmpcache, &it->finfo->fid, SI4); + } + cdb_lock_unlock(myio->lock); + return -1; + } + return 0; +} + + +static int _vio_apnd2_pageiternext(CDBVIO *vio, CDBPAGE **page, void *iter) +{ + VIOAPND2ITOR *it = (VIOAPND2ITOR *)iter; + CDBPAGE *cpage; + uint32_t fixbufsize = SBUFSIZE - (sizeof(CDBPAGE) - PAGEHSIZE); + + for(;;) { + if (it->off >= it->fsize) { + it->oid = CDBMAX(it->oid, it->finfo->oidl); + _vio_apnd2_iterfree(vio, VIOAPND2_INDEX, it); + if (_vio_apnd2_iterfirst(vio, it, VIOAPND2_INDEX, it->oid) < 0) + return -1; + } + cpage = (CDBPAGE *)(it->mmap + it->off -(sizeof(CDBPAGE) - PAGEHSIZE)); + if (cpage->magic != PAGEMAGIC) { + it->off += ALIGNBYTES; + continue; + } + if (PAGESIZE(cpage) <= fixbufsize) + memcpy(&(*page)->magic, &cpage->magic, PAGESIZE(cpage)); + else { + *page = (CDBPAGE *)malloc(sizeof(CDBPAGE) + (*page)->num * sizeof(PITEM)); + memcpy(&(*page)->magic, &cpage->magic, PAGESIZE(cpage)); + } + (*page)->osize = PAGESIZE(cpage); + (*page)->cap = (*page)->num; + ROFF2VOFF(it->finfo->fid, it->off, (*page)->ooff); + /* set iterator to next one */ + it->oid = (*page)->oid + 1; + it->off += OFFALIGNED(PAGESIZE(cpage)); + return 0; + } + return -1; +} + +static int _vio_apnd2_reciternext(CDBVIO *vio, CDBREC **rec, void *iter) +{ + VIOAPND2ITOR *it = (VIOAPND2ITOR *)iter; + CDBREC *crec; + uint32_t fixbufsize = SBUFSIZE - (sizeof(CDBREC) - RECHSIZE); + + for(;;) { + if (it->off >= it->fsize) { + it->oid = CDBMAX(it->oid, it->finfo->oidl); + _vio_apnd2_iterfree(vio, VIOAPND2_DATA, 
it); + if (_vio_apnd2_iterfirst(vio, it, VIOAPND2_DATA, it->oid) < 0) + return -1; + } + crec = (CDBREC *)(it->mmap + it->off -(sizeof(CDBREC) - RECHSIZE)); + if (crec->magic != RECMAGIC && crec->magic != DELRECMAGIC) { + it->off += ALIGNBYTES; + continue; + } + if (RECSIZE(crec) <= fixbufsize) + memcpy(&(*rec)->magic, &crec->magic, RECSIZE(crec)); + else { + *rec = (CDBREC *)malloc(sizeof(CDBREC) + crec->ksize + crec->vsize); + memcpy(&(*rec)->magic, &crec->magic, RECSIZE(crec)); + } + + (*rec)->osize = RECSIZE(crec); + (*rec)->expire = crec->expire; + ROFF2VOFF(it->finfo->fid, it->off, (*rec)->ooff); + (*rec)->key = (*rec)->buf; + (*rec)->val = (*rec)->buf + (*rec)->ksize; + + /* set iterator to next one */ + it->oid = (*rec)->oid + 1; + it->off += OFFALIGNED(RECSIZE(crec)); + return 0; + } + return -1; +} + + +static int _vio_apnd2_iterfree(CDBVIO *vio, int dtype, VIOAPND2ITOR *it) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + if (it->mmap) { + munmap(it->mmap, it->fsize); + close(it->fd); + cdb_lock_lock(myio->lock); + it->finfo->ref--; + if (it->finfo->ref == 0 && it->finfo->unlink) { + /* unlink the file */ + VIOAPND2FINFO *tfinfo; + it->finfo->fnext->fprev = it->finfo->fprev; + it->finfo->fprev->fnext = it->finfo->fnext; + tfinfo = it->finfo; + it->finfo = it->finfo->fnext; + _vio_apnd2_unlink(vio, tfinfo, dtype); + if (dtype == VIOAPND2_INDEX) + cdb_ht_del2(myio->idxmeta, &tfinfo->fid, SI4); + else if (dtype == VIOAPND2_DATA) + cdb_ht_del2(myio->datmeta, &tfinfo->fid, SI4); + } else + it->finfo = it->finfo->fnext; + if (it->finfo) + it->finfo->ref++; + cdb_lock_unlock(myio->lock); + it->mmap = NULL; + } + return 0; +} + + +static void* _vio_apnd2_reciterfirst(CDBVIO *vio, uint64_t oid) +{ + VIOAPND2ITOR *it = (VIOAPND2ITOR *)malloc(sizeof(VIOAPND2ITOR)); + + /* iterator won't get to buffered data */ + _vio_apnd2_flushbuf(vio, VIOAPND2_DATA); + it->mmap = NULL; + it->finfo = NULL; + if (_vio_apnd2_iterfirst(vio, it, VIOAPND2_DATA, oid) < 0) { + free(it); 
+ return NULL; + } + return (void*)it; +} + + +static void _vio_apnd2_reciterdestory(CDBVIO *vio, void *iter) +{ + if (iter) { + _vio_apnd2_iterfree(vio, VIOAPND2_DATA, (VIOAPND2ITOR *)iter); + free(iter); + } +} + +static void* _vio_apnd2_pageiterfirst(CDBVIO *vio, uint64_t oid) +{ + VIOAPND2ITOR *it = (VIOAPND2ITOR *)malloc(sizeof(VIOAPND2ITOR)); + + /* iterator won't get to buffered data */ + _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX); + it->mmap = NULL; + it->finfo = NULL; + if (_vio_apnd2_iterfirst(vio, it, VIOAPND2_INDEX, oid) < 0) { + free(it); + return NULL; + } + return (void*)it; +} + + +static void _vio_apnd2_pageiterdestory(CDBVIO *vio, void *iter) +{ + if (iter) { + _vio_apnd2_iterfree(vio, VIOAPND2_INDEX, (VIOAPND2ITOR *)iter); + free(iter); + } +} + +static int _vio_apnd2_rcyledatafile(CDBVIO *vio, VIOAPND2FINFO *finfo, bool rcyle) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + int fd; + char filename[MAX_PATH_LEN]; + uint32_t nexpire = 0xffffffff; + + snprintf(filename, MAX_PATH_LEN, "%s/dat%08d.cdb", myio->filepath, finfo->fid); + fd = open(filename, O_RDONLY, 0644); + if (fd < 0) { + cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__); + return -1; + } + + uint32_t frsize = 0, fsize = lseek(fd, 0, SEEK_END); + uint32_t pos = FILEMETASIZE; + char *map = mmap(NULL, fsize, PROT_READ, MAP_PRIVATE, fd, 0); + uint32_t now = time(NULL); + while(pos < fsize) { + CDBREC *rec = (CDBREC *)&map[pos-(sizeof(CDBREC) - RECHSIZE)]; + FOFF off; + uint64_t hash; + + if (rec->magic != RECMAGIC && rec->magic != DELRECMAGIC) { + pos += ALIGNBYTES; + continue; + } + + ROFF2VOFF(finfo->fid, pos, off); + hash = CDBHASH64(rec->buf, rec->ksize); + if (cdb_checkoff(vio->db, hash, off, CDB_NOTLOCKED) + /* not expired */ + && (rec->expire > now || rec->expire == 0)) { + /* nearest expire record in current file */ + if (rec->expire && rec->expire < nexpire) + nexpire = rec->expire; + + /* record exist in index, skip */ + if (rcyle) { + FOFF noff; + rec->ooff = off; + 
rec->osize = OFFALIGNED(RECSIZE(rec)); + _vio_apnd2_writerecinternal(vio, rec, &noff); + cdb_replaceoff(vio->db, hash, off, noff, CDB_NOTLOCKED); + } + } else { + if (rcyle && rec->expire && rec->expire < now) { + /* expired record, delete from index page */ + cdb_updatepage(vio->db, hash, off, CDB_PAGEDELETEOFF, CDB_NOTLOCKED); + } + frsize += OFFALIGNED(RECSIZE(rec)); + } + pos += OFFALIGNED(RECSIZE(rec)); + } + munmap(map, fsize); + close(fd); + cdb_lock_lock(myio->lock); + /* fix metainfo about nearest expire time in current data file */ + if (nexpire == 0xffffffff) + finfo->nexpire = 0; + else + finfo->nexpire = nexpire; + finfo->rcyled = frsize; + if (rcyle) { + /* unlink */ + finfo->unlink = true; + } + cdb_lock_unlock(myio->lock); + return 0; +} + + +static void _vio_apnd2_cleanpoint(CDBVIO *vio) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + char filename[MAX_PATH_LEN]; + + cdb_lock_lock(myio->lock); + _vio_apnd2_flushbuf(vio, VIOAPND2_DATA); + _vio_apnd2_flushbuf(vio, VIOAPND2_INDEX); + _vio_apnd2_writehead(vio, false); + if (myio->dfd > 0) + close(myio->dfd); + snprintf(filename, MAX_PATH_LEN, "%s/dellog.cdb", myio->filepath); + /* clean the previous deletion log */ + myio->dfd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); + /* open failed, whom to tell? 
*/ + if (myio->dfd < 0) + cdb_seterrno(vio->db, CDB_OPENERR, __FILE__, __LINE__); + cdb_lock_unlock(myio->lock); +} + + +static int _vio_apnd2_checkopensig(CDBVIO *vio) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + int pos = 0; + uint32_t ret; + + if (myio->hfd < 0) + return -1; + + pos += FILEMAGICLEN; + pos += SI4; + pos += SI8; + pos += SI8; + pos += SI8; + if (pread(myio->hfd, &ret, SI4, pos) != SI4) + return -1; + + return ret; +} + + +static int _vio_apnd2_setopensig(CDBVIO *vio, int sig) +{ + VIOAPND2 *myio = (VIOAPND2 *)vio->iometa; + int pos = 0; + uint32_t val = sig; + if (myio->hfd < 0) + return -1; + + pos += FILEMAGICLEN; + pos += SI4; + pos += SI8; + pos += SI8; + pos += SI8; + if (pwrite(myio->hfd, &val, SI4, pos) != SI4) + return -1; + return 0; +} + + diff --git a/libdap-cuttdb/src/vio_apnd2.h b/libdap-cuttdb/src/vio_apnd2.h new file mode 100644 index 0000000000000000000000000000000000000000..cb47a7dbd18a0f40ad1f9a571b35b94feaeff6d5 --- /dev/null +++ b/libdap-cuttdb/src/vio_apnd2.h @@ -0,0 +1,23 @@ +/* + * CuttDB - a fast key-value storage engine + * + * + * http://code.google.com/p/cuttdb/ + * + * Copyright (c) 2012, Siyuan Fu. All rights reserved. + * Use and distribution licensed under the BSD license. + * See the LICENSE file for full text + * + * Author: Siyuan Fu <fusiyuan2010@gmail.com> + * + */ + + +#ifndef _VIO_APND2_H_ +#define _VIO_APND2_H_ +#include "cdb_vio.h" + + +void vio_apnd2_init(CDBVIO *vio); + +#endif