/* Block until the threaded I/O subsystem is idle, that is, until both the
 * io_newjobs and io_processing queues are empty and no I/O thread is
 * active. This is needed before we fork() in order to BGSAVE or
 * BGREWRITEAOF. */
void waitEmptyIOJobsQueue(void) {
    for (;;) {
        int idle;
        int completed = 0;

        /* Take a snapshot of the queues state under the lock. */
        lockThreadedIO();
        idle = listLength(server.io_newjobs) == 0 &&
               listLength(server.io_processing) == 0 &&
               server.io_active_threads == 0;
        if (!idle) completed = listLength(server.io_processed);
        unlockThreadedIO();

        if (idle) return;

        if (!completed) {
            /* Nothing to post-process yet: just give the threads time. */
            usleep(10000); /* 10 milliseconds */
            continue;
        }

        /* There are finished jobs: post-process some of them while we wait,
         * since I/O threads may be stuck writing to io_ready_pipe_write
         * when too many notifications are pending. The non-NULL privdata
         * tells the handler that it is not being called by the event
         * loop. */
        vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,
                                 (void*)0xdeadbeef,0);
        usleep(1000); /* 1 millisecond */
    }
}
void *IOThreadEntryPoint(void *arg) { iojob *j; listNode *ln; REDIS_NOTUSED(arg); pthread_detach(pthread_self()); while(1) { /* Get a new job to process */ lockThreadedIO(); if (listLength(server.io_newjobs) == 0) { /* No new jobs in queue, exit. */ redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do", (long) pthread_self()); server.io_active_threads--; unlockThreadedIO(); return NULL; } ln = listFirst(server.io_newjobs); j = ln->value; listDelNode(server.io_newjobs,ln); /* Add the job in the processing queue */ j->thread = pthread_self(); listAddNodeTail(server.io_processing,j); ln = listLast(server.io_processing); /* We use ln later to remove it */ unlockThreadedIO(); redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'", (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr); /* Process the Job */ if (j->type == REDIS_IOJOB_LOAD) { vmpointer *vp = (vmpointer*)j->id; j->val = vmReadObjectFromSwap(j->page,vp->vtype); } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) { j->pages = rdbSavedObjectPages(j->val); } else if (j->type == REDIS_IOJOB_DO_SWAP) { if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR) j->canceled = 1; } /* Done: insert the job into the processed queue */ redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)", (long) pthread_self(), (void*)j, (char*)j->key->ptr); lockThreadedIO(); listDelNode(server.io_processing,ln); listAddNodeTail(server.io_processed,j); unlockThreadedIO(); /* Signal the main thread there is new stuff to process */ redisAssert(write(server.io_ready_pipe_write,"x",1) == 1); } return NULL; /* never reached */ }
/* Queue a threaded swap-out request for the value 'val' stored at 'key'
 * in 'db'.
 *
 * A REDIS_IOJOB_PREPARE_SWAP job is created and queued, and the value is
 * flagged as REDIS_VM_SWAPPING. The function always returns REDIS_OK. */
int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
    iojob *job = zmalloc(sizeof(*job));

    /* Both the key and the value are referenced by the job. */
    incrRefCount(key);
    incrRefCount(val);

    job->type = REDIS_IOJOB_PREPARE_SWAP;
    job->db = db;
    job->key = key;
    job->val = val;
    job->id = val; /* For swap jobs the ID is the value itself. */
    job->canceled = 0;
    job->thread = (pthread_t) -1;

    val->storage = REDIS_VM_SWAPPING;

    lockThreadedIO();
    queueIOJob(job);
    unlockThreadedIO();
    return REDIS_OK;
}
/* Cancel the threaded I/O job whose ID is the object 'o'.
 *
 * The three job queues are scanned in order (new jobs, processing,
 * processed) looking for a non canceled job with a matching ID:
 *
 * - Found in io_newjobs: the job is freed and removed from the queue.
 * - Found in io_processing: we release the lock, sleep a little, and start
 *   the whole search again until the job leaves that queue.
 * - Found in io_processed: the job is just flagged as canceled.
 *
 * The object storage type is then reverted to what it was before the job
 * was created. Not finding any matching job is a fatal error. */
void vmCancelThreadedIOJob(robj *o) {
    list *queues[3];
    int q;

    queues[0] = server.io_newjobs;
    queues[1] = server.io_processing;
    queues[2] = server.io_processed;

    redisAssert(o->storage == REDIS_VM_LOADING ||
                o->storage == REDIS_VM_SWAPPING);

again:
    lockThreadedIO();
    for (q = 0; q < 3; q++) {
        listIter iter;
        listNode *node;

        listRewind(queues[q],&iter);
        while ((node = listNext(&iter)) != NULL) {
            iojob *job = node->value;

            /* Skip jobs already canceled and jobs about other objects. */
            if (job->canceled || job->id != o) continue;

            redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
                (void*)job, (char*)job->key->ptr, job->type, q);

            if (q == 1) {
                /* io_processing: an I/O thread is working on this job right
                 * now, so it is not safe to touch it. Release the lock,
                 * give the thread a moment to finish, and search again:
                 * the job will show up in the processed queue. This
                 * condition should be very rare. */
                unlockThreadedIO();
                usleep(1);
                goto again;
            }

            /* Mark the pages as free since the swap either did not happen
             * or happened but is now discarded. */
            if (job->type == REDIS_IOJOB_DO_SWAP)
                vmMarkPagesFree(job->page,job->pages);

            if (q == 0) {
                /* io_newjobs: not yet picked up by any thread, so simply
                 * drop the job from the queue. */
                freeIOJob(job);
                listDelNode(queues[q],node);
            } else {
                /* io_processed: flag it, so that it gets ignored when the
                 * completed jobs are post-processed. */
                job->canceled = 1;
            }

            /* Finally "UNDO" the operation on the object storage type. */
            if (o->storage == REDIS_VM_LOADING)
                o->storage = REDIS_VM_SWAPPED;
            else if (o->storage == REDIS_VM_SWAPPING)
                o->storage = REDIS_VM_MEMORY;

            unlockThreadedIO();
            redisLog(REDIS_DEBUG,"*** DONE");
            return;
        }
    }
    unlockThreadedIO();
    printf("Not found: %p\n", (void*)o);
    redisAssert(1 != 1); /* We should never reach this */
}
/* Every time a thread finishes a job, it writes a byte into the write side
 * of a unix pipe in order to "awake" the main thread, and this function
 * is called.
 *
 * Note that this is called both by the event loop, when an I/O thread
 * sends a byte in the notification pipe, and is also directly called from
 * waitEmptyIOJobsQueue().
 *
 * In the latter case we don't want to swap more, so we use the
 * "privdata" argument setting it to a not NULL value to signal this
 * condition.
 *
 * 'fd' is the descriptor the notification bytes are read from, 'el' and
 * 'mask' are not used. For every byte read one job is removed from the
 * head of server.io_processed and post-processed according to its type.
 * The function returns after a number of non canceled jobs equal to
 * 'toprocess' was handled ('toprocess' is computed once, on the first
 * iteration, from the length of the processed queue), or when there are
 * no more bytes to read. */
void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask) {
    char buf[1];
    int retval, processed = 0, toprocess = -1, trytoswap = 1;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);
    /* NOTE(review): privdata is flagged as not used here, but it is
     * actually tested in the next statement. */
    REDIS_NOTUSED(privdata);

    if (privdata != NULL) trytoswap = 0; /* check the comments above... */

    /* For every byte we read in the read side of the pipe, there is one
     * I/O job completed to process. */
#ifndef _WIN32
    while((retval = read(fd,buf,1)) == 1) {
#else
    DWORD pipe_is_on = 0;
    while (1) {
        retval = 0;
        /* Windows fix: we need to peek the pipe, since read would block. */
        if (!PeekNamedPipe((HANDLE) _get_osfhandle(fd), NULL, 0, NULL, &pipe_is_on, NULL)) {
            /* NOTE(review): GetLastError() returns a Win32 error code, not
             * an errno value, so strerror() may not describe the actual
             * failure -- to be confirmed. */
            redisLog(REDIS_DEBUG,"PeekReadPipe failed %s", strerror(GetLastError()));
            break;
        }

        /* No data on pipe */
        if (!pipe_is_on)
            break;

        if ((retval = read(fd,buf,1)) != 1)
            break;
#endif
        iojob *j;
        listNode *ln;
        struct dictEntry *de;

        /* Get the processed element (the oldest one) */
        lockThreadedIO();
        redisLog(REDIS_DEBUG,"Processing I/O completed job");
        redisAssert(listLength(server.io_processed) != 0);
        /* Only on the first iteration: decide how many jobs to handle in
         * this call, as a percentage of the queue length (at least one). */
        if (toprocess == -1) {
            toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
            if (toprocess <= 0) toprocess = 1;
        }
        ln = listFirst(server.io_processed);
        j = ln->value;
        listDelNode(server.io_processed,ln);
        unlockThreadedIO();
        /* If this job is marked as canceled, just ignore it. Note that
         * canceled jobs are not counted in 'processed'. */
        if (j->canceled) {
            freeIOJob(j);
            continue;
        }
        /* Post process it in the main thread, as there are things we
         * can do just here to avoid race conditions and/or invasive locks */
        redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
        de = dictFind(j->db->dict,j->key->ptr);
        redisAssert(de != NULL);
        if (j->type == REDIS_IOJOB_LOAD) {
            redisDb *db;
            vmpointer *vp = dictGetEntryVal(de);

            /* Key loaded, bring it at home: release the swap pages and
             * replace the vm pointer stored in the dictionary with the
             * loaded value. */
            vmMarkPagesFree(vp->page,vp->usedpages);
            redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
                (unsigned char*) j->key->ptr);
            server.vm_stats_swapped_objects--;
            server.vm_stats_swapins++;
            dictGetEntryVal(de) = j->val;
            incrRefCount(j->val);
            db = j->db;
            /* Handle clients waiting for this key to be loaded. */
            handleClientsBlockedOnSwappedKey(db,j->key);
            freeIOJob(j);
            /* The old vm pointer is no longer referenced by the dict. */
            zfree(vp);
        } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
            /* Now we know the amount of pages required to swap this object.
             * Let's find some space for it, and queue this task again
             * rebranded as REDIS_IOJOB_DO_SWAP. */
            if (!vmCanSwapOut() ||
                vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
            {
                /* Ooops... no space or we can't swap as there is
                 * a fork()ed Redis trying to save stuff on disk. */
                j->val->storage = REDIS_VM_MEMORY; /* undo operation */
                freeIOJob(j);
            } else {
                /* Note that we need to mark these pages as used now,
                 * if the job will be canceled, we'll mark them as freed
                 * again. */
                vmMarkPagesUsed(j->page,j->pages);
                j->type = REDIS_IOJOB_DO_SWAP;
                lockThreadedIO();
                queueIOJob(j);
                unlockThreadedIO();
            }
        } else if (j->type == REDIS_IOJOB_DO_SWAP) {
            vmpointer *vp;

            /* Key swapped. We can finally free some memory. */
            if (j->val->storage != REDIS_VM_SWAPPING) {
                /* Unexpected state: dump some debugging info on stdout
                 * before the assertion below fails. This 'vp' shadows the
                 * outer one. */
                vmpointer *vp = (vmpointer*) j->id;
                printf("storage: %d\n",vp->storage);
                printf("key->name: %s\n",(char*)j->key->ptr);
                printf("val: %p\n",(void*)j->val);
                printf("val->type: %d\n",j->val->type);
                printf("val->ptr: %s\n",(char*)j->val->ptr);
            }
            redisAssert(j->val->storage == REDIS_VM_SWAPPING);
            /* Replace the value stored in the dictionary with a vm pointer
             * recording where the object lives in the swap. */
            vp = createVmPointer(j->val);
            vp->page = j->page;
            vp->usedpages = j->pages;
            dictGetEntryVal(de) = vp;
            /* Fix the storage otherwise decrRefCount will attempt to
             * remove the associated I/O job */
            j->val->storage = REDIS_VM_MEMORY;
            decrRefCount(j->val);
            redisLog(REDIS_DEBUG,
                "VM: object %s swapped out at %lld (%lld pages) (threaded)",
                (unsigned char*) j->key->ptr,
                (unsigned long long) j->page, (unsigned long long) j->pages);
            server.vm_stats_swapped_objects++;
            server.vm_stats_swapouts++;
            freeIOJob(j);
            /* Put a few more swap requests in queue if we are still
             * out of memory */
            if (trytoswap && vmCanSwapOut() &&
                zmalloc_used_memory() > server.vm_max_memory)
            {
                int more = 1;
                while(more) {
                    /* Keep queueing while there are fewer new jobs than
                     * the configured max number of threads. */
                    lockThreadedIO();
                    more = listLength(server.io_newjobs) <
                            (unsigned) server.vm_max_threads;
                    unlockThreadedIO();
                    /* Don't waste CPU time if swappable objects are rare:
                     * on failure stop trying for the rest of this call. */
                    if (vmSwapOneObjectThreaded() == REDIS_ERR) {
                        trytoswap = 0;
                        break;
                    }
                }
            }
        }
        processed++;
        if (processed == toprocess) return;
    }
    /* The loop stopped because of a read error: EAGAIN is not reported. */
    if (retval < 0 && errno != EAGAIN) {
        redisLog(REDIS_WARNING,
            "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
            strerror(errno));
    }
}

/* Acquire server.io_mutex, the mutex used around accesses to the I/O jobs
 * queues. The return value of pthread_mutex_lock() is not checked. */
void lockThreadedIO(void) {
    pthread_mutex_lock(&server.io_mutex);
}
/* Make the client 'c' wait for the key 'key' to be loaded in memory.
 *
 * Returns 0 when there is nothing to wait for: the key does not exist, its
 * value is already in RAM, or it was being swapped out (in which case the
 * swap job is canceled). Otherwise the key is appended to the client
 * io_keys list, the client is appended to the list of clients stored at
 * this key in the db->io_keys dictionary, a load job is created if the key
 * is not already being loaded, and 1 is returned. */
int waitForSwappedKey(redisClient *c, robj *key) {
    struct dictEntry *entry;
    robj *val;
    list *waiters;
    vmpointer *vp;
    iojob *job;

    /* Nothing to do if the key is missing or its value is in memory. */
    entry = dictFind(c->db->dict,key->ptr);
    if (entry == NULL) return 0;
    val = dictGetEntryVal(entry);
    if (val->storage == REDIS_VM_MEMORY) return 0;
    if (val->storage == REDIS_VM_SWAPPING) {
        /* We were swapping the key, undo it! */
        vmCancelThreadedIOJob(val);
        return 0;
    }

    /* From now on the key is either swapped, or being loaded. */

    /* Client => keys it is waiting for. */
    listAddNodeTail(c->io_keys,key);
    incrRefCount(key);

    /* Key => clients waiting for it. The list of clients is created the
     * first time somebody waits for this key. */
    entry = dictFind(c->db->io_keys,key);
    if (entry != NULL) {
        waiters = dictGetEntryVal(entry);
    } else {
        int added;

        waiters = listCreate();
        added = dictAdd(c->db->io_keys,key,waiters);
        incrRefCount(key);
        redisAssert(added == DICT_OK);
    }
    listAddNodeTail(waiters,c);

    /* If the key is already being loaded there is no job to create. */
    if (val->storage != REDIS_VM_SWAPPED) return 1;

    /* Create and queue the job loading the value from the swap. */
    vp = (vmpointer*)val;
    val->storage = REDIS_VM_LOADING;
    job = zmalloc(sizeof(*job));
    job->type = REDIS_IOJOB_LOAD;
    job->db = c->db;
    job->id = (robj*)vp;
    job->key = key;
    incrRefCount(key);
    job->page = vp->page;
    job->val = NULL;
    job->canceled = 0;
    job->thread = (pthread_t) -1;

    lockThreadedIO();
    queueIOJob(job);
    unlockThreadedIO();
    return 1;
}