예제 #1
0
파일: synQueue.c 프로젝트: davidhoover/kent
void synQueuePut(struct synQueue *sq, void *message)
/* Add message to end of queue.
 * The append to sq->queue and the signal on sq->cond are both done while
 * sq->mutex is held; the mutex is released before returning. */
{
pthreadMutexLock(&sq->mutex);
dlAddValTail(sq->queue, message);
/* Signal only after the message is on the list (presumably this wakes a
 * consumer waiting on sq->cond - confirm against the matching get routine). */
pthreadCondSignal(&sq->cond);
pthreadMutexUnlock(&sq->mutex);
}
예제 #2
0
static struct dlList *sortedListFromTree(struct rbTree *tree)
/* Return a newly allocated doubly-linked list holding every item of tree,
 * appended in the order rbTreeItems() returns them (sorted order).  The
 * temporary reference list is freed here; the items themselves are shared
 * with the tree. */
{
struct slRef *itemRefs = rbTreeItems(tree);
struct dlList *sorted = dlListNew();
struct slRef *cur = itemRefs;

while (cur != NULL)
    {
    dlAddValTail(sorted, cur->val);
    cur = cur->next;
    }
slFreeList(&itemRefs);
return sorted;
}
예제 #3
0
파일: letterChain.c 프로젝트: bowhan/kent
void letterChain(char *inFile, char *outFile, int maxSize)
/* letterChain - Make Markov chain of letters in text.
 * Reads inFile one character at a time, keeping a sliding window of up to
 * maxSize letters in a doubly-linked list.  Each time the window changes
 * (once it is full) it is added to a trie; the shrinking tail of the window
 * is added at end of input as well.  The trie is then dumped to outFile. */
{
struct dlList *window = dlListNew();
int windowSize = 0;
int ch;
FILE *in = mustOpen(inFile, "r");
FILE *out;
struct dlNode *link;
UBYTE *letter;
struct trie *trie;

AllocVar(trie);

for (;;)
    {
    ch = getc(in);
    if (ch < 0)
        break;
    if (windowSize >= maxSize)
        {
        /* Window is full: recycle the oldest node, overwrite its letter
         * and move it to the tail. */
        link = dlPopHead(window);
        letter = link->val;
        *letter = ch;
        dlAddTail(window, link);
        addToTrie(trie, window);
        continue;
        }
    /* Still filling the window: allocate a one-byte cell for the letter. */
    letter = needMem(1);
    *letter = ch;
    dlAddValTail(window, letter);
    windowSize += 1;
    if (windowSize == maxSize)
        addToTrie(trie, window);
    }

/* Input shorter than the window: it was never added above. */
if (windowSize < maxSize)
    addToTrie(trie, window);

/* Drain the window from the head, adding each shorter suffix. */
for (link = dlPopHead(window); link != NULL; link = dlPopHead(window))
    {
    addToTrie(trie, window);
    freeMem(link->val);
    freeMem(link);
    }
dlListFree(&window);
carefulClose(&in);

out = mustOpen(outFile, "w");
rDumpTrie(0, trie->useCount, trie, out);

carefulClose(&out);
}
예제 #4
0
파일: genieCon.c 프로젝트: bowhan/kent
void makeEntities(struct clonePair *pairList, struct dlList **entLists)
/* Lump pairs of cDNAs into entities based on them having overlapping
 * and compatable cDNAs.
 * For each pair, the entity list of its chromosome (entLists[pair->chromIx])
 * is searched for compatible entities.  If some are found, both ends of the
 * pair are added to them and they are merged.  Otherwise a new entity is
 * made from the pair and appended to the chromosome's list. */
{
struct dlList *chromEntList;
struct entity *compatableList, *entity;
struct clonePair *pair;
int pairCount = 0;

for (pair = pairList; pair != NULL; pair = pair->next)
    {
    /* Progress report every 1000 pairs. */
    pairCount += 1;
    if (pairCount % 1000 == 0)
        printf("Processing pair %d\n", pairCount);

    chromEntList = entLists[pair->chromIx];
    compatableList = findCompatableEntities(chromEntList, pair);
    if (compatableList != NULL)
        {
        compatableList = addToEntityList(compatableList, pair->p3);
        compatableList = addToEntityList(compatableList, pair->p5);
        mergeEntities(compatableList);
        continue;
        }

    /* Nothing compatible yet - start a new entity from this pair. */
    if (!pair->p5)
        {
        entity = newEntity(pair->p3);
        }
    else
        {
        entity = newEntity(pair->p5);
        /* There are a few rare cases where the two ends of a pair are
         * not compatible; then only the p5 end goes into the entity. */
        if (pair->p3 && isCompatable(entity, pair->p3))
            {
            entity = addToEntityList(entity, pair->p3);
            entity = mergeEntities(entity);
            }
        }
    /* Remember the list node so the entity can find itself on the list. */
    entity->node = dlAddValTail(chromEntList, entity);
    }
}
예제 #5
0
struct dgNodeRef *dgConstrainedPriorityOrder(struct diGraph *dg)
/* Return traversal of graph in priority order subject to
 * constraint that all parents must be output before
 * their children regardless of node priority.
 * Graph must be cycle free; aborts if it is not.
 * The result is a newly allocated list of references to the graph nodes. */
{
struct dlList *pending = newDlList();
struct dgNode *gNode;
struct dlNode *candidate;
struct dgNodeRef *refList = NULL, *ref;

if (dgHasCycles(dg))
    errAbort("Call to dgConstrainedPriorityOrder on graph with cycles.");

/* Put every node on the pending list, unvisited, then sort by priority. */
gNode = dg->nodeList;
while (gNode != NULL)
    {
    dlAddValTail(pending, gNode);
    gNode->visited = FALSE;
    gNode = gNode->next;
    }
dlSort(pending, cmpPriority);

/* Repeatedly take the first pending node whose parents are all visited. */
while (!dlEmpty(pending))
    {
    candidate = pending->head;
    while (candidate->next != NULL)
        {
        gNode = candidate->val;
        if (!dgParentsAllVisited(gNode))
            {
            candidate = candidate->next;
            continue;
            }
        /* Found one: move it from the pending list to the output. */
        dlRemove(candidate);
        freeMem(candidate);
        AllocVar(ref);
        ref->node = gNode;
        slAddHead(&refList, ref);
        gNode->visited = TRUE;
        break;
        }
    }
freeDlList(&pending);
/* References were pushed on the head, so reverse to get output order. */
slReverse(&refList);
return refList;
}
예제 #6
0
struct dgEdge *dgConnectWithVal(struct diGraph *dg, struct dgNode *a, 
     struct dgNode *b, void *val)
/* Connect node a to node b and put val on edge.  If a is already connected
 * to b the existing edge is returned unchanged, with a warning when val
 * differs from the value already stored on it. */
{
struct dgConnection *existing, *link;
struct dgEdge *edge;
struct dlNode *edgeNode;

/* Reuse the edge if a -> b is already connected. */
existing = dgFindNodeInConList(a->nextList, b);
if (existing != NULL)
    {
    edge = existing->edgeOnList->val;
    if (edge->val != val)
        warn("Trying to add new value to edge between %s and %s, ignoring",
           a->name, b->name);
    return edge;
    }

/* Make the new edge and append it to the graph's edge list. */
AllocVar(edge);
edge->a = a;
edge->b = b;
edge->val = val;
edgeNode = dlAddValTail(dg->edgeList, edge);

/* Forward connection: b goes on a's next list. */
AllocVar(link);
link->node = b;
link->edgeOnList = edgeNode;
slAddHead(&a->nextList, link);

/* Backward connection: a goes on b's prev list. */
AllocVar(link);
link->node = a;
link->edgeOnList = edgeNode;
slAddHead(&b->prevList, link);

return edge;
}
예제 #7
0
파일: synteny.c 프로젝트: davidhoover/kent
void figureSynteny(char *inName, FILE *out)
/* Figure out synteny stats - how much in a row aligns.
 * Reads the alignment dump inName a line at a time.  Contigs are collected
 * per query sequence; whenever the query name changes (and once more at end
 * of file) the accumulated contig list is handed to syntenyOnClone() and
 * emptied.  Finally the cloneSegCounts[] and kCounts[] tables and a
 * segment-size summary are written to out.
 *
 * Line kinds recognized:
 *   "Aligning q:start-end strand ... t:start-end strand" - starts a contig
 *   "best ... score"                                      - scores the contig
 *   "<offset> <symbols>"   - query (phase 0) or target (phase 2) symbols
 *   "(..."                 - resets the phase
 *   anything else          - phase 1 is ignored, phase 3 is hidden symbols
 * Aborts on malformed or out-of-phase input. */
{
FILE *in;
char line[512];
int lineCount = 0;
char *words[64];
int wordCount;
char *firstWord;
char *queryName = "";
struct contig *contig;
struct dlNode *contigNode;
struct dlList *contigList = newDlList();
int lineState = 0;   /* Keeps track of groups of four lines. */
struct slName *queryNameList = NULL;
int maxSymCount = 64*1024;
char *qSymBuf = needMem(maxSymCount+1);
char *tSymBuf = needMem(maxSymCount+1);
char *hSymBuf = needMem(maxSymCount+1);
int symCount = 0;
int qSymLen, tSymLen, hSymLen;
int bestSegScore;
int lastQoff = -1;
int i;

in = mustOpen(inName, "r");
while (fgets(line, sizeof(line), in))
    {
    ++lineCount;
    if ((lineCount%100000) == 0)
        {
        printf("Processing line %d of %s\n", lineCount, inName);
        }
    if (++lineState == 5)
        lineState = 0;
    wordCount = chopLine(line, words);
    if (wordCount <= 0)
        continue;
    firstWord = words[0];
    if (sameString(firstWord, "Aligning"))
        {
        char *queryString;
        char *targetString;
        char queryStrand, targetStrand;
        char *parts[8];
        int partCount;

        /* Do some preliminary checking of this line. */
        if (wordCount < 6)
            errAbort("Short line %d of %s", lineCount, inName);
        queryString = words[1];
        queryStrand = words[2][0];
        targetString = words[4];
        targetStrand = words[5][0];

        /* Extract the name of the query sequence.  If it's new,
         * then write out contigs on previous query we've accumulated
         * so far and start a new list. */
        partCount = chopString(queryString, ":-", parts, ArraySize(parts));
        /* Need name, start and end; parts[1] and parts[2] are read below. */
        if (partCount < 3)
            errAbort("Malformed query range line %d of %s", lineCount, inName);
        if (!sameString(parts[0], queryName))
            {
            /* Allocate new name and keep track of it. */
            struct slName *newName = newSlName(parts[0]);
            slAddHead(&queryNameList, newName);

            /* Set last Segment for this clone to impossible val. */
            bestSegScore = -0x3fffffff;
            lastQoff = -1;

            /* Write out old contigs and empty out contig list. */
            syntenyOnClone(queryName, contigList, out);
            freeContigList(&contigList);
            contigList = newDlList();
            queryName = newName->name;
            }
        
        /* Make up a new contig, and fill it in with the data we
         * have so far about query. */
        AllocVar(contig);
        contig->query = queryName;
        contig->qOffset = atoi(parts[1]);
        contig->qEndOffset = atoi(parts[2]);
        contig->qStrand = queryStrand;

        /* A new query offset starts a new segment, so forget the best
         * score seen for the previous one. */
        if (lastQoff != contig->qOffset)
            {
            lastQoff = contig->qOffset;
            bestSegScore = -0x3fffffff;
            }
        /* Parse target string and fill in contig with its info. */
        partCount = chopString(targetString, ":-", parts, ArraySize(parts));
        if (partCount < 3)
            errAbort("Malformed target range line %d of %s", lineCount, inName);
        contig->target = cloneString(parts[0]);
        contig->tOffset = atoi(parts[1]);
        contig->tEndOffset = atoi(parts[2]);
        contig->tStrand = targetStrand;

        /* We don't know start and end yet - set them to values
         * that will get easily replaced by max/min. */
        contig->qStart = contig->tStart = 0x3fffffff;

        /* The pre-increment at the top of the loop makes the next
         * line phase 0. */
        lineState = -1;
        symCount = 0;
        }
    else if (sameString(firstWord, "best"))
        {
        if (wordCount < 3)
            errAbort("Short line %d of %s", lineCount, inName);
        contig->score = atoi(words[2]);
        if (contig->score > bestSegScore && contig->score >= minScore)
            {
            struct dlNode *tailNode;
            struct contig *tailContig;
            bestSegScore = contig->score;
            contig->isComplete = TRUE;
            contig->qSym = cloneStringZ(qSymBuf, symCount);
            contig->tSym = cloneStringZ(tSymBuf, symCount);
            contig->hSym = cloneStringZ(hSymBuf, symCount);
            contig->symCount = symCount;
            contig->qEnd = contig->qStart + countNonGap(qSymBuf, symCount);
            contig->tEnd = contig->tStart + countNonGap(tSymBuf, symCount);
            /* If the list's tail has the same query offset, this better
             * scoring contig replaces it. */
            tailNode = contigList->tail;
            if (tailNode != NULL)
                {
                tailContig = tailNode->val;
                if (tailContig->qOffset == contig->qOffset)
                    {
                    freeContig(&tailContig);
                    dlRemove(tailNode);
                    freeMem(tailNode);
                    }
                }
            contigNode = dlAddValTail(contigList, contig);
            }
        }
    /* Symbol line: needs a second word and starts with a digit or '-'.
     * The parentheses matter - without them a one-word line starting with
     * '-' would get here and read a stale words[1]. */
    else if (wordCount > 1 && (isdigit((unsigned char)firstWord[0]) || firstWord[0] == '-'))
        {
        int start, end;
        char *sym = words[1];
        int symLen = strlen(sym);
        char firstChar = firstWord[0];
        if (lineState != 0 && lineState != 2)
            errAbort("Bummer - phasing mismatch on lineState line %d of %s!\n", lineCount, inName);
        assert(lineState == 0 || lineState == 2);
        start = atoi(firstWord);
        end = start + symLen;
        if (symCount + symLen > maxSymCount)
            {
            errAbort("Single contig too long line %d of %s, can only handle up to %d symbols\n",
                lineCount, inName, maxSymCount);
            }
        if (lineState == 0) /* query symbols */
            {
            qSymLen = symLen;
            if (isdigit((unsigned char)firstChar))
                {
                start += contig->qOffset;
                end += contig->qOffset;
                contig->qStart = min(contig->qStart, start);
                contig->qEnd = max(contig->qEnd, end);
                }
            memcpy(qSymBuf+symCount, sym, symLen);
            }
        else               /* target symbols */
            {
            tSymLen = symLen;
            if (tSymLen != qSymLen)
                {
                errAbort("Target symbol size not same as query line %d of %s",
                    lineCount, inName);
                }            
            if (isdigit((unsigned char)firstChar))
                {
                start += contig->tOffset;
                end += contig->tOffset;
                contig->tStart = min(contig->tStart, start);
                }
            memcpy(tSymBuf+symCount, sym, symLen);
            }
        }
    else if (firstWord[0] == '(')
        {
        lineState = -1;
        }
    else
        {
        assert(lineState == 1 || lineState == 3);
        if (lineState == 3)  /* Hidden symbols. */
            {
            char *sym = firstWord;
            int symLen = strlen(sym);
            hSymLen = symLen;
            if (hSymLen != qSymLen)
                {
                errAbort("Hidden symbol size not same as query line %d of %s",
                    lineCount, inName);
                }
            memcpy(hSymBuf+symCount, sym, symLen);
            /* The hidden line is the last of a group, so this is where
             * the shared buffer position advances. */
            symCount += symLen;
            }        
        }
    } 
/* Flush contigs of the last query. */
syntenyOnClone(queryName, contigList, out);
freeContigList(&contigList);
fclose(in);

slFreeList(&queryNameList);
freeMem(qSymBuf);
freeMem(tSymBuf);
freeMem(hSymBuf);

fprintf(out, "CloneSegCounts[] = \n");
for (i=0; i<ArraySize(cloneSegCounts); ++i)
    fprintf(out, "%d %d\n", i, cloneSegCounts[i]);
fprintf(out, "\n");

fprintf(out, "kCounts[] = \n");
for (i=0; i<ArraySize(kCounts); ++i)
    fprintf(out, "%d %d\n", i, kCounts[i]);

/* Avoid dividing by zero when no segments were counted. */
segAverageSize = (segCount != 0) ? round((double)segTotalSize/segCount) : 0;
fprintf(out, "\n%d Segments, average size %d\n", segCount, segAverageSize);

}
예제 #8
0
파일: synQueue.c 프로젝트: davidhoover/kent
void synQueuePutUnprotected(struct synQueue *sq, void *message)
/* Add message to end of queue without protecting against multithreading
 * contention - used before pthreads are launched perhaps.
 * Unlike the locking put, this neither takes sq->mutex nor signals
 * sq->cond; it only appends to sq->queue. */
{
dlAddValTail(sq->queue, message);
}
예제 #9
0
struct dgNodeRef *dgFindPath(struct diGraph *dg, struct dgNode *a, struct dgNode *b)
/* Find shortest path from a to b.  Return NULL if can't be found.
 * On success returns a newly allocated list of node references running
 * from a to b inclusive.  Uses the tempEntry field of every node in dg as
 * scratch space (back-pointer to the node it was reached from). */
{
struct dgNodeRef *refList  = NULL, *ref;
struct dgConnection *con;
struct dgNode *node, *nNode;
struct dlList *fifo;
struct dlNode *ffNode;
struct dgNode endNode;	/* Sentinel marking the start of the path. */
int fifoSize = 1;

/* Do some quick and easy tests first to return if have no way out
 * of node A, or if B directly follows A. */
if (a->nextList == NULL)
    return NULL;
if (a == b)
    {
    AllocVar(ref);
    ref->node = a;
    return ref;
    }
if ((con = dgFindNodeInConList(a->nextList, b)) != NULL)
    {
    AllocVar(refList);
    refList->node = a;
    node = con->node;
    AllocVar(ref);
    ref->node = node;
    slAddTail(&refList, ref);
    return refList;
    }

/* Set up for breadth first traversal.  Will use a doubly linked
 * list as a fifo. */
for (node = dg->nodeList; node != NULL; node = node->next)
    node->tempEntry = NULL;
fifo = newDlList();
dlAddValTail(fifo, a);
a->tempEntry = &endNode;

/* refList stays NULL until b is reached, so testing it here stops the
 * search as soon as the path has been built.  Previously the break below
 * only left the inner loop and the rest of the fifo was still traversed. */
while (refList == NULL && (ffNode = dlPopHead(fifo)) != NULL)
    {
    --fifoSize;
    node = ffNode->val;
    freeMem(ffNode);
    for (con = node->nextList; con != NULL; con = con->next)
	{
	nNode = con->node;
	if (nNode->tempEntry == NULL)
	    {
	    /* First time reached: remember where we came from. */
	    nNode->tempEntry = node;
	    if (nNode == b)
		{
		/* Walk the back-pointers from b to a, pushing on the head
		 * so the list ends up in a-to-b order. */
		while (nNode != &endNode && nNode != NULL)
		    {
		    AllocVar(ref);
		    ref->node = nNode;
		    slAddHead(&refList, ref);
		    nNode = nNode->tempEntry;
		    }
		break;
		}
	    else
		{
		dlAddValTail(fifo, nNode);
		++fifoSize;
		if (fifoSize > 100000)
		    errAbort("Internal error in dgFindPath");
		}
	    }
	}
    }
/* Any nodes still queued are released along with the list
 * (NOTE(review): relies on freeDlList freeing remaining nodes, as the
 * original code already did when it broke out with items queued). */
freeDlList(&fifo);
return refList;
}
void doRun(char *line, struct sockaddr_in *hubIp)
/* Execute command.
 * Parses line as a run-job message and, if the job id is not already known
 * and a process slot is free, forks a child that runs the command.  The
 * parent records the job on the jobsRunning list.  Every rejected request
 * only produces a warning. */
{
char *jobMessage = cloneString(line);
static char *args[1024];
int argCount;
char hubDottedQuad[17];
struct runJobMessage rjm;
struct subText *st;
struct job *job;
int jobId, childPid, i;

nextRandom();
if (line == NULL)
    {
    warn("Executing nothing...");
    goto cleanup;
    }
if (!internetIpToDottedQuad(ntohl(hubIp->sin_addr.s_addr), hubDottedQuad))
    {
    warn("Can't convert ipToDottedQuad");
    goto cleanup;
    }
if (!parseRunJobMessage(line, &rjm))
    goto cleanup;

jobId = atoi(rjm.jobIdString);
if (findRunningJob(jobId) != NULL || findFinishedJob(jobId) != NULL)
    {
    warn("Duplicate run-job %d\n", jobId);
    goto cleanup;
    }
if (busyProcs >= maxProcs)
    {
    warn("Trying to run when busy.");
    goto cleanup;
    }

argCount = chopLine(rjm.command, args);
if (argCount >= ArraySize(args))
    {
    /* No room left for the terminating NULL. */
    warn("Too many arguments to run");
    goto cleanup;
    }
args[argCount] = NULL;

childPid = forkOrDie();
if (childPid == 0)
    {
    /* Child: do JOB_ID substitutions, then run the job. */
    st = subTextNew("$JOB_ID", rjm.jobIdString);
    rjm.in = subTextString(st, rjm.in);
    rjm.out = subTextString(st, rjm.out);
    rjm.err = subTextString(st, rjm.err);
    for (i = 0; i < argCount; ++i)
        args[i] = subTextString(st, args[i]);

    execProc(hubDottedQuad, rjm.jobIdString, rjm.reserved,
        rjm.user, rjm.dir, rjm.in, rjm.out, rjm.err, rjm.ram,
        args[0], args);
    exit(0);
    }

/* Parent: record the running job. */
AllocVar(job);
job->jobId = jobId;
job->pid = childPid;
job->startMessage = jobMessage;
jobMessage = NULL;	/* No longer own memory. */
job->node = dlAddValTail(jobsRunning, job);
++busyProcs;

cleanup:
freez(&jobMessage);
}
예제 #11
0
파일: ccCp.c 프로젝트: sktu/kentUtils
void ccCp(char *source, char *dest, char *hostList)
/* Copy source to dest on all files in hostList.
 * The first copy is made from this host with cp/rcp.  After that, machines
 * that already have the file act as sources for machines that do not, with
 * one child process per copy, until every machine has either finished or
 * failed maxErrCount times.  Progress is printed about every three seconds
 * and a summary at the end.  Aborts if source does not exist or if the HOST
 * environment variable is not set. */
{
time_t startTime = time(NULL);
time_t curTime, lastTime = 0;
struct machine *machineList = NULL;
struct netSwitch *nsList;
struct machine *m, *m2;
struct dlList *toDoList = newDlList();          /* We haven't done these. */
struct dlList *finishedList = newDlList();	/* All done here. */
struct dlList *sourceList = newDlList();        /* These are sources for copies. */
struct dlList *workingList = newDlList();       /* These are copying data to themselves. */
struct dlList *errList = newDlList();           /* These are messed up 3x or more. */
bool firstOk = FALSE;
struct dlNode *finNode, *node, *sourceNode, *destNode;
struct dyString *cmd = newDyString(256);
int machineCount;
int machinesFinished = 0;
char *thisHost = getenv("HOST");
off_t size;
int goodMachines;
double grandTotal;

/* getenv returns NULL when HOST is unset; the name comparison below
 * would then dereference a null pointer. */
if (thisHost == NULL)
    errAbort("HOST environment variable is not set\n");

/* Get host and switch info. */
readHosts(hostList, &machineList, &nsList);
machineCount = slCount(machineList);

/* Make sure file exists.... */
if (!fileExists(source))
    errAbort("%s doesn't exist\n", source);
size = fileSize(source);
printf("Copying %s (%lld bytes) to %d machines\n", source, (long long)size, machineCount);

/* Add everything to the to-do list. */
for (m = machineList; m != NULL; m = m->next)
    {
    dlAddValTail(toDoList, m);
    }


/* Loop through to-do list trying to do first copy. */
for (node = toDoList->head; node->next != NULL; node = node->next)
    {
    dyStringClear(cmd);
    m = node->val;
    if (sameString(thisHost, m->name))
	{
	if (sameString(source, dest))
	    {
	    /* Hey, this is too easy. */
	    /* NOTE(review): this path counts the machine as finished but
	     * leaves its node on toDoList - confirm that is intended. */
	    firstOk = TRUE;
	    ++machinesFinished;
	    break;
	    }
	else
	    {
	    dyStringPrintf(cmd, "cp %s %s", source, dest);
	    }
	}
    else
	{
	dyStringPrintf(cmd, "rcp %s %s:%s", source, m->name, dest);
	}
    if (system(cmd->string) == 0)
	{
	dlRemove(node);
	dlAddTail(finishedList, node);
	firstOk = TRUE;
	++machinesFinished;
	break;
	}
    else  /* some error in rcp */
	{
	warn("Problem with %s\n", cmd->string);
	m->errCount += 1;
	}
    }

/* Loop around launching child processes to copy and
 * wait for them to finish. */
while (machinesFinished < machineCount)
    {
    int pid;
    int status;

    /* Start all possible copies. */
    while (matchMaker(finishedList, toDoList, &sourceNode, &destNode))
	{
	dlAddTail(sourceList, sourceNode);
	dlAddTail(workingList, destNode);
	m = destNode->val;
	m->sourceNode = sourceNode;
	startCopy(sourceNode->val, destNode->val, dest, thisHost, cmd);
	}

    curTime = time(NULL);
    if (curTime - lastTime >= 3)
	{
	printf("%d finished in %d seconds, %d in progress, %d to start, %d errors, %d total\n",
	    dlCount(finishedList) + dlCount(sourceList), (int)(curTime - startTime),
	    dlCount(workingList), dlCount(toDoList), dlCount(errList), machineCount);
	lastTime = curTime;
	}

    /* Wait for a child to finish.  Figure out which machine it is. */
    pid = wait(&status);
    finNode = NULL;
    for (node = workingList->head; node->next != NULL; node = node->next)
	{
	m = node->val;
	if (m->pid == pid)
	    {
	    finNode = node;
	    break;
	    }
	}
    if (finNode == NULL)
	{
	errAbort("Returned from wait on unknown child %d\n", pid);
	continue;
	}

    /* The copy is over either way: take the destination off the working
     * list and return its source to the finished list so it can serve
     * another copy. */
    m = finNode->val;
    m->pid = 0;
    dlRemove(finNode);
    dlRemove(m->sourceNode);
    m2 = m->sourceNode->val;
    if (m->netSwitch != m2->netSwitch)
	--crossSwitchCount;
    dlAddTail(finishedList, m->sourceNode);

    if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
	{
	/* Good return - move self and source node to finished list. */
	++machinesFinished;
	dlAddTail(finishedList, finNode);
	}
    else
	{
	/* Bad return.  Increment error count, and maybe move it to
	 * error list. */
	if (++m->errCount >= maxErrCount)
	    {
	    ++machinesFinished;
	    dlAddTail(errList, finNode);
	    fprintf(stderr, "Gave up on %s\n", m->name);
	    }
	else
	    {
	    dlAddMiddle(toDoList, finNode);
	    fprintf(stderr, "Retry %d on %s\n", m->errCount, m->name);
	    }
	}
    }
if (!dlEmpty(errList))
    {
    fprintf(stderr, "errors in:");
    for (node = errList->head; node->next != NULL; node = node->next)
	{
	m = node->val;
	fprintf(stderr, " %s", m->name);
	}
    fprintf(stderr, "\n");
    }
goodMachines = dlCount(finishedList);
grandTotal = (double)goodMachines * (double)size;
printf("Copied to %d of %d machines (grand total %e bytes) in %d seconds\n", 
	goodMachines, machineCount, grandTotal, (int)(time(NULL) - startTime));
}
예제 #12
0
파일: wordChain.c 프로젝트: bowhan/kent
struct wordTree *wordTreeForChainsInFile(char *fileName, int chainSize, struct lm *lm)
/* Return a wordTree of all chains-of-words of length chainSize seen in file. 
 * Allocate the structure in local memory pool lm.
 * Lines are lowercased first when the 'lower' option is set, and a fixed set
 * of punctuation characters is stripped from each word when 'unpunc' is
 * set. */ 
{
/* Line-at-a-time reader and current word. */
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line, *word;

/* Sliding window of the most recent words, oldest at the head. */
struct dlNode *link;
struct dlList *window = dlListNew();
int windowSize = 0;

/* Tree is grown from an empty root. */
struct wordTree *wt = wordTreeNew("");	
int wordCount = 0;

/* Characters removed from words when unpunc is on, in stripping order. */
char *punctuation = ",.;-\"?!()";
char *p;

/* Save time/space by sharing stack between all "following" rbTrees. */
struct rbTreeNode **stack;	
lmAllocArray(lm, stack, 256);

while (lineFileNext(lf, &line, NULL))
    {
    if (lower)
        tolowers(line);
    for (word = nextWord(&line); word != NULL; word = nextWord(&line))
        {
        if (unpunc)
            {
            for (p = punctuation; *p != 0; ++p)
                stripChar(word, *p);
            /* Word was nothing but punctuation - skip it entirely. */
            if (word[0] == 0)
                continue;
            }
        verbose(2, "%s\n", word);

        /* Keep the window at most chainSize words long.  While it is
         * still filling up, just append, and add it to the tree once when
         * it first becomes full.  After that each new word replaces the
         * oldest one and the window is added to the tree every time. */
        if (windowSize >= chainSize)
            {
            /* Reuse the head node with a fresh copy of the word and move
             * it to the tail. */
            link = dlPopHead(window);
            freeMem(link->val);
            link->val = cloneString(word);
            dlAddTail(window, link);
            addChainToTree(wt, window, lm, stack);
            }
        else
            {
            dlAddValTail(window, cloneString(word));
            windowSize += 1;
            if (windowSize == chainSize)
                addChainToTree(wt, window, lm, stack);
            }
        ++wordCount;
        }
    }

/* File with fewer than chainSize words: window never got added above. */
if (windowSize < chainSize)
    addChainToTree(wt, window, lm, stack);

/* Last few words in file, where a full-size chain can't be made: add each
 * shorter remainder while emptying the window. */
for (link = dlPopHead(window); link != NULL; link = dlPopHead(window))
    {
    addChainToTree(wt, window, lm, stack);
    freeMem(link->val);
    freeMem(link);
    }
dlListFree(&window);
lineFileClose(&lf);
return wt;
}
예제 #13
0
struct wordStore *wordStoreForChainsInFile(char *fileName, int chainSize)
/* Return a wordStore containing all words, and also all chains-of-words of length 
 * chainSize seen in file.
 * Each line is treated as a separate read whose first word is an id that
 * is skipped.  Chains never span lines. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line, *word;

/* The store owns the words; its Markov tree starts from an empty root. */
struct wordStore *store = wordStoreNew(chainSize);
struct wordTree *wt = wordTreeNew(wordStoreAdd(store, ""));	
store->markovChains = wt;

while (lineFileNext(lf, &line, NULL))
    {
    /* Per-line sliding window of the most recent words. */
    struct dlNode *link;
    struct dlList *window = dlListNew();
    int windowSize = 0;
    int wordCount = 0;

    /* skipping the first word which is the read id */
    word = nextWord(&line);

    for (word = nextWord(&line); word != NULL; word = nextWord(&line))
        {
        struct wordInfo *info = wordStoreAdd(store, word);
        /* While the window is still filling, append and add it to the
         * tree once when it first reaches chainSize.  Afterwards each new
         * word replaces the oldest and the window is added every time. */
        if (windowSize >= chainSize)
            {
            /* Reuse the head node for the new word at the tail. */
            link = dlPopHead(window);
            link->val = info;
            dlAddTail(window, link);
            addChainToTree(wt, window);
            }
        else
            {
            dlAddValTail(window, info);
            windowSize += 1;
            if (windowSize == chainSize)
                addChainToTree(wt, window);
            }
        ++wordCount;
        }

    /* Line with fewer than chainSize words: window was never added. */
    if (windowSize < chainSize)
        addChainToTree(wt, window);

    /* Last few words in line: add each shorter, non-empty remainder while
     * emptying the window.  Values belong to the store, so only the list
     * nodes are freed. */
    for (link = dlPopHead(window); link != NULL; link = dlPopHead(window))
        {
        if (!dlEmpty(window))
            addChainToTree(wt, window);
        freeMem(link);
        }
    dlListFree(&window);
    }
lineFileClose(&lf);

wordTreeSort(wt);  // Make output of chain file prettier
return store;
}