Пример #1
0
/*
 * Copy the NUL-terminated string src to dst, which points into recdata.
 * Whenever the write position reaches the end of recdata, recsize is
 * bumped by CHUNK and growrec() is called with the address of the write
 * position (presumably so it can be relocated along with the buffer --
 * confirm against growrec).
 * Returns the position of the NUL that was written, so the next append
 * overwrites it.
 */
static unsigned char *
recbld_append(unsigned char *dst, unsigned char *src)
{
	while ((*dst = *src++) != '\0') {
		dst++;
		if (dst >= &recdata[recsize]) {
			recsize += CHUNK;
			growrec(&recdata, &recsize, recsize, &dst, 1);
		}
	}
	return dst;
}

/*
 * recbld - rebuild the record text from the fields.
 *
 * Does nothing when donerec is already 1.  Otherwise the string values
 * of fields 1..*NF are concatenated into recdata with *OFS between
 * consecutive fields, recloc and record are pointed at the result, and
 * donerec is set to 1.
 */
void recbld(void)
{
	unsigned char *out;
	int fld;

	if (donerec == 1)
		return;

	out = recdata;
	for (fld = 1; fld <= *NF; fld++) {
		out = recbld_append(out, getsval(fldtab[fld]));
		/* separator goes between fields only, not after the last */
		if (fld < *NF)
			out = recbld_append(out, *OFS);
	}
	*out = '\0';

	dprintf( ("in recbld FS=%o, recloc=%lo\n", **FS, 
		(long)recloc) );
	recloc->tval = REC | STR | DONTFREE;
	record = recdata;
	recloc->sval = record;
	dprintf( ("in recbld FS=%o, recloc=%lo\n", **FS, 
		(long)recloc) );
	dprintf( ("recbld = |%s|\n", record) );
	donerec = 1;
}
Пример #2
0
/*
 * readrec - read one record from inf into *buf.
 *
 * buf/bufsize: growable buffer and its size; an empty buffer
 *              (*bufsize == 0) is first grown to CHUNK.
 * inf:         input stream.
 *
 * The record separator is the first character of *RS, whose byte length
 * n is obtained through next().  When *RS is empty the separator is a
 * newline, leading newlines are skipped, and a record only ends at two
 * newlines in a row (or EOF).
 *
 * Returns 0 when EOF was hit before anything was stored, 1 otherwise.
 *
 * nrr counts the bytes still free in *buf.  Every path that stores
 * bytes makes sure at least n + 3 bytes are free first, so a partial
 * separator match (at most n bytes) or the two bytes stored in
 * empty-RS mode always fit, with room left for the terminating NUL.
 */
int readrec(unsigned char **buf, int *bufsize, FILE *inf)
{
	register int sep, c, k, m, n;
	unsigned char *rr;
	register int nrr;
	wchar_t wc;

	next(wc, *RS, n);
	if ((sep = **RS) == 0) {
		sep = '\n';
		while ((c = getc(inf)) == '\n')	/* skip leading \n's */
			;
		if (c != EOF)
			ungetc(c, inf);
	}
	if (*bufsize == 0)
		growrec(buf, bufsize, CHUNK, NULL, 0);
	for (rr = *buf, nrr = *bufsize; ; ) {
	cont:	for (; (c=getc(inf)) != sep && c != EOF; *rr++ = c)
			if (--nrr < n + 3) {
				growrec(buf, bufsize, *bufsize + CHUNK, &rr, 0);
				nrr += CHUNK;
			}
		if (c != EOF) {
			/*
			 * Note: This code does not restrict occurrences of
			 * the multibyte sequence in RS to the start of an
			 * input character.
			 */
			for (m = 1; m < n; m++) {
				if ((c = getc(inf)) == EOF || c != (*RS)[m]) {
					/*
					 * Not a separator after all: the bytes
					 * matched so far are ordinary data.
					 * This path can repeat without the
					 * loop above ever running, so it needs
					 * its own room check.
					 */
					if (nrr < n + 3) {
						growrec(buf, bufsize,
							*bufsize + CHUNK, &rr, 0);
						nrr += CHUNK;
					}
					for (k = 0; k < m; k++)
						*rr++ = (*RS)[k];
					nrr -= k;
					if (c == EOF)
						break;
					*rr++ = c;
					nrr--;
					goto cont;
				}
			}
		}
		if (**RS == sep || c == EOF)
			break;
		if ((c = getc(inf)) == '\n' || c == EOF) /* 2 in a row */
			break;
		/*
		 * Single newline inside an empty-RS record: keep it and the
		 * byte that followed.  Both are checked and counted so nrr
		 * stays in step with rr.
		 */
		if (nrr < n + 3) {
			growrec(buf, bufsize, *bufsize + CHUNK, &rr, 0);
			nrr += CHUNK;
		}
		*rr++ = '\n';
		*rr++ = c;
		nrr -= 2;
	}
	*rr = 0;
	dprintf( ("readrec saw <%s>, returns %d\n", *buf, c == EOF
		&& rr == *buf ? 0 : 1) );
	return c == EOF && rr == *buf ? 0 : 1;
}
Пример #3
0
/*
 * recbld - rebuild the record text from the fields.
 *
 * Does nothing unless the fields have been split (donefld != 0) and the
 * record is out of date (donerec != 1).  The string values of fields
 * 1..*NF are copied into record, each followed by the first character
 * of *OFS; the separator after the last field is replaced by the
 * terminating NUL.  recloc->tval is then set to STR | FLD.
 *
 * Room is checked before every byte is stored (including the byte that
 * later holds the separator), so empty fields cannot walk past the end
 * of record.  With *NF == 0 the record simply becomes the empty string
 * instead of storing a NUL in front of the buffer.
 */
void
recbld(void)
{
	int i;
	register char *r, *p;

	if (donefld == 0 || donerec == 1)
		return;
	r = record;
	for (i = 1; i <= *NF; i++) {
		p = getsval(fldtab[i]);
		do {
			if (r >= record+RECSIZE) {
				/* record may move; keep the offset */
				size_t	diff = r - record;
				growrec();
				r = &record[diff];
			}
		} while ((*r++ = *p++) != '\0');
		/* the field's NUL becomes the output field separator */
		*(r-1) = **OFS;
	}
	if (r > record)
		r--;		/* drop the separator after the last field */
	*r = '\0';
	dprintf("in recbld FS=%lo, recloc=%lo\n", (long)**FS, (long)recloc);
	recloc->tval = STR | FLD;
	dprintf("in recbld FS=%lo, recloc=%lo\n", (long)**FS, (long)recloc);
	dprintf("recbld = |%s|\n", record);
}
Пример #4
0
/*
 * makerec - install data as the current record.
 *
 * The previous record string is released unless it is flagged DONTFREE,
 * recdata is grown when it is smaller than size, and data is copied
 * into it as a NUL-terminated string.  recloc and record are pointed at
 * the copy, the record is marked valid (donerec = 1) and the fields
 * stale (donefld = 0).
 *
 * NOTE(review): size is presumably the byte count including the
 * terminating NUL -- confirm against the callers.
 *
 * Returns the new record pointer.
 */
unsigned char *makerec(const unsigned char *data, int size)
{
	if ((recloc->tval & DONTFREE) == 0)
		xfree(recloc->sval);

	if (size > recsize)
		growrec(&recdata, &recsize, size, NULL, 0);

	record = recdata;
	strcpy((char *)record, (const char *)data);

	recloc->tval = REC | STR | DONTFREE;
	recloc->sval = record;

	donefld = 0;
	donerec = 1;
	return record;
}
Пример #5
0
/*
 * grow - build the tree t from the dataset d.
 *
 * Allocates the root node, sets its pos/neg totals to the summed
 * weights of the examples with valid[i] > 0 (pos when target[i] is
 * non-zero, neg otherwise), keeps both totals within
 * [FLT_EPSILON, 1-FLT_EPSILON], and then grows the tree recursively
 * from depth 0.
 *
 * If the root cannot be allocated, t->root is left NULL and nothing
 * else is done.
 */
void grow(tree_t* t, dataset_t* d){
    int i;

    /* Initialize root fields */
    t->root = malloc(sizeof *t->root);
    if(t->root == NULL)
        return;
    /* Start from FLT_EPSILON rather than 0 so neither total is ever zero */
    t->root->pos = FLT_EPSILON;
    t->root->neg = FLT_EPSILON;
    for(i=0; i<d->nex; i++){
        if(t->valid[i]<=0)
            continue;
        if(d->target[i])
            t->root->pos += d->weight[i];
        else
            t->root->neg += d->weight[i];
    }
    t->root->pos = min(1-FLT_EPSILON, t->root->pos);
    t->root->neg = min(1-FLT_EPSILON, t->root->neg);
    /* Recursively grow tree */
    growrec(t, t->root, d, 0);
}
Пример #6
0
/*
 * growrec - recursively grow the subtree rooted at root.
 *
 * t:     tree being built; t->valid and t->used are modified during the
 *        recursion and restored before returning.
 * root:  node to split; its pos/neg totals must already be set.
 * d:     dataset.
 * depth: depth of root.
 *
 * root becomes a leaf (split = -1) when the maximum depth is reached,
 * when either total is <= FLT_EPSILON, when bestSplit() reports no
 * feature, when one side of the best split is empty, or when the two
 * child nodes cannot be allocated.
 */
void growrec(tree_t* t, node_t* root, dataset_t* d, int depth){
    split_t best;
    int i,k,l,u;
    node_t* first;
    node_t* second;
    node_t* left;
    node_t* right;
    evpair_t* b;

    /* Stop if max depth is reached or node is pure */
    if(depth>=t->maxdepth || root->pos <= FLT_EPSILON || root->neg <= FLT_EPSILON){
        root->split=-1;
        return;
    }

    /* Find the best split */
    best = bestSplit(t,root,d);

    /* Stop if no good split is left or the counts in one of the children are very small */
    if (best.feature < 0 || 
            (best.posleft <= FLT_EPSILON && best.negleft <= FLT_EPSILON) || 
            (best.posright <= FLT_EPSILON && best.negright <= FLT_EPSILON)){
        root->split=-1;
        return;
    }

    /* Allocate both children up front; without them the node stays a leaf */
    left = malloc(sizeof *left);
    right = malloc(sizeof *right);
    if(left == NULL || right == NULL){
        free(left);
        free(right);
        root->split=-1;
        return;
    }

    /* Install the split */
    root->split=best.feature;
    root->threshold=best.threshold;
    root->left=left;
    root->left->pos=best.posleft;
    root->left->neg=best.negleft;
    root->right=right;
    root->right->pos=best.posright;
    root->right->neg=best.negright;

    /* Mark the feature as used */
    if(!d->cont[best.feature])
        t->used[best.feature]=1;
    b = d->feature[best.feature];
    /* Binary search for the first entry whose value exceeds the threshold
     * (assumes b is sorted by value -- confirm where d->feature is built) */
    k = 0;
    u = d->size[best.feature];
    while (k < u) {
        i = (k + u)/2;
        if (b[i].value > best.threshold)
            u = i;
        else
            k = i + 1;
    }
    if (best.threshold > 0){
        l=k;
        u=d->size[best.feature];
        first = root->left;
        second = root->right;
    }
    else{
        l=0;
        u=k;
        first = root->right;
        second = root->left;
    }
    /* Let X be the entries b[l..u-1]:
     *   threshold > 0:  X = entries with value >  threshold, first = left
     *   otherwise:      X = entries with value <= threshold, first = right
     *
     * For every x in X decrease valid[x] by 1.
     * This leads to valid[x] > 0 iff x was previously valid and is not in X
     *
     * build the first subtree (using the valid examples)
     *
     * For every x in X increase valid[x] by 2.
     * For every example x decrease valid[x] by 1.
     * This leads to valid[x] > 0 iff x was previously valid and is in X
     *
     * build the second subtree (using the valid examples)
     *
     * Finally restore: 
     * For every x in X decrease valid[x] by 1.
     * For every example x increase valid[x] by 1.
     * This makes valid obtain its original state 
     * (One can verify this by adding up all the transformations)
     */
    for(i=l; i<u; i++)
        t->valid[b[i].example]-=1;
    growrec(t, first, d, depth+1);
    for(i=l; i<u; i++)
        t->valid[b[i].example]+=2;
    for(i=0; i<d->nex; i++)
        t->valid[i]-=1;
    growrec(t, second, d, depth+1);
    for(i=l; i<u; i++)
        t->valid[b[i].example]-=1;
    for(i=0; i<d->nex; i++)
        t->valid[i]+=1;
    /* Unmark the feature */
    if(!d->cont[best.feature])
        t->used[best.feature]=0;
}
Пример #7
0
/*
 * Make sure rr is more than n + 3 bytes away from the end of record,
 * growing record when it is not.  growrec() may move record, so the
 * offset of rr is kept and the (possibly new) position is returned.
 */
static char *
getrec_room(char *rr, int n)
{
	if (rr >= record+RECSIZE-n-3) {
		size_t	diff = rr - record;
		growrec();
		rr = &record[diff];
	}
	return rr;
}

/*
 * getrec - read the next input record into record.
 *
 * Walks the remaining command-line operands (svargc/svargv): operands
 * containing '=' are handed to setclvar(), the others are opened as
 * input files.  One record is read from the current file, using the
 * first character of *RS as separator (newline, with a record ending at
 * two newlines in a row, when *RS is empty).  fldbld() is called when
 * mustfld is set.
 *
 * Returns 1 when a record was read (after bumping the counter in
 * nrloc), 0 when all operands are exhausted.
 *
 * Every path that stores bytes first calls getrec_room(), including the
 * partial multibyte-separator path, which can repeat without the plain
 * byte loop ever running.
 */
int
getrec(void)
{
	register char *rr;
	extern int svargc;
	extern char **svargv;
	register int c, sep, k, m, n;
	wchar_t wc;

	dprintf("**RS=%o, **FS=%o\n", **RS, **FS);
	donefld = 0;
	donerec = 1;
	record[0] = 0;
	while (svargc > 0) {
		dprintf("svargc=%d, *svargv=%s\n", svargc, *svargv);
		if (infile == NULL) {	/* have to open a new file */
			if (member('=', *svargv)) {	/* it's a var=value argument */
				setclvar(*svargv);
				svargv++;
				svargc--;
				continue;
			}
			*FILENAME = file = *svargv;
			dprintf("opening file %s\n", file);
			/*
			 * NOTE(review): this matches any operand that starts
			 * with '-', not just "-" -- confirm that is intended.
			 */
			if (*file == '-') {
				if (yyin == stdin && ! lexprog)
					error(FATAL, "standard input already used for reading commands");
				else
					infile = stdin;
			}
			else if ((infile = fopen(file, "r")) == NULL)
				error(FATAL, "can't open %s", file);
		}
		next(wc, *RS, n);
		if ((sep = **RS) == 0)
			sep = '\n';
		for (rr = record; ; ) {
		cont:	for (; (c=getc(infile)) != sep && c != EOF; *rr++ = c)
				rr = getrec_room(rr, n);
			if (c != EOF) {
				/*
				 * Note: This code does not restrict occurrences
				 * of the multibyte sequence in RS to the start
				 * of an input character.
				 */
				for (m = 1; m < n; m++) {
					if ((c = getc(infile)) == EOF ||
							c != (*RS)[m]) {
						/*
						 * Not a separator after all:
						 * store the bytes matched so
						 * far as ordinary data.
						 */
						rr = getrec_room(rr, n);
						for (k = 0; k < m; k++)
							*rr++ = (*RS)[k];
						if (c == EOF)
							break;
						*rr++ = c;
						goto cont;
					}

				}
			}
			if (**RS == sep || c == EOF)
				break;
			if ((c = getc(infile)) == '\n' || c == EOF)	/* 2 in a row */
				break;
			rr = getrec_room(rr, n);
			*rr++ = '\n';
			*rr++ = c;
		}
		*rr = 0;
		if (mustfld)
			fldbld();
		if (c != EOF || rr > record) {	/* normal record */
			recloc->tval &= ~NUM;
			recloc->tval |= STR;
			++nrloc->fval;
			nrloc->tval &= ~STR;
			nrloc->tval |= NUM;
			return(1);
		}
		/* EOF arrived on this file; set up next */
		if (infile != stdin)
			fclose(infile);
		infile = NULL;
		svargc--;
		svargv++;
	}
	return(0);	/* true end of file */
}