void recbld(void) { int i; unsigned char *r, *p; if (donerec == 1) return; r = recdata; for (i = 1; i <= *NF; i++) { p = getsval(fldtab[i]); while ((*r = *p++)) { if (++r >= &recdata[recsize]) { recsize += CHUNK; growrec(&recdata, &recsize, recsize, &r, 1); } } if (i < *NF) for ((p = *OFS); (*r = *p++); ) { if (++r >= &recdata[recsize]) { recsize += CHUNK; growrec(&recdata, &recsize, recsize, &r, 1); } } } *r = '\0'; dprintf( ("in recbld FS=%o, recloc=%lo\n", **FS, (long)recloc) ); recloc->tval = REC | STR | DONTFREE; recloc->sval = record = recdata; dprintf( ("in recbld FS=%o, recloc=%lo\n", **FS, (long)recloc) ); dprintf( ("recbld = |%s|\n", record) ); donerec = 1; }
/*
 * readrec - read one record from inf into *buf.
 *
 * buf, bufsize: growable buffer and its size; growrec() is used to create
 *               it (when *bufsize is 0) and to enlarge it by CHUNK.
 * inf:          input stream.
 *
 * The record separator is taken from *RS.  n is set by next(wc, *RS, n)
 * and is used here as the number of bytes of the (possibly multibyte)
 * separator character -- NOTE(review): confirm against the next() macro.
 * When *RS is empty, records are separated by blank lines: leading
 * newlines are skipped, a single newline stays inside the record, and two
 * newlines in a row (or EOF) end it.
 *
 * Returns 0 when EOF was hit before anything was stored, otherwise 1.
 *
 * nrr tracks the free bytes left at rr.  Every store path keeps at least
 * n + 3 bytes in reserve; previously only the plain-byte loop checked,
 * so repeated partial separator matches or paragraph-mode newlines could
 * write past the end of the buffer.
 */
int readrec(unsigned char **buf, int *bufsize, FILE *inf)
{
	int sep, c, k, m, n;
	unsigned char *rr;
	int nrr;
	wchar_t wc;

	next(wc, *RS, n);
	if ((sep = **RS) == 0) {
		sep = '\n';
		while ((c = getc(inf)) == '\n')	/* skip leading \n's */
			;
		if (c != EOF)
			ungetc(c, inf);
	}
	if (*bufsize == 0)
		growrec(buf, bufsize, CHUNK, NULL, 0);
	for (rr = *buf, nrr = *bufsize; ; ) {
	cont:
		/* plain bytes up to the first byte of the separator */
		for (; (c = getc(inf)) != sep && c != EOF; *rr++ = c)
			if (--nrr < n + 3) {
				growrec(buf, bufsize, *bufsize + CHUNK, &rr, 0);
				nrr += CHUNK;
			}
		if (c != EOF) {
			/*
			 * Note: This code does not restrict occurrences of
			 * the multibyte sequence in RS to the start of an
			 * input character.
			 */
			for (m = 1; m < n; m++) {
				if ((c = getc(inf)) == EOF || c != (*RS)[m]) {
					/*
					 * Partial match only: the m matched
					 * bytes plus c become record data, so
					 * make room for them first.
					 */
					if (nrr - (m + 1) < n + 3) {
						growrec(buf, bufsize, *bufsize + CHUNK, &rr, 0);
						nrr += CHUNK;
					}
					for (k = 0; k < m; k++)
						*rr++ = (*RS)[k];
					nrr -= k;
					if (c == EOF)
						break;
					*rr++ = c;
					nrr--;
					goto cont;
				}
			}
		}
		if (**RS == sep || c == EOF)
			break;
		if ((c = getc(inf)) == '\n' || c == EOF)	/* 2 in a row */
			break;
		/* paragraph mode: keep the single newline and the byte after it */
		if (nrr - 2 < n + 3) {
			growrec(buf, bufsize, *bufsize + CHUNK, &rr, 0);
			nrr += CHUNK;
		}
		*rr++ = '\n';
		*rr++ = c;
		nrr -= 2;
	}
	*rr = 0;
	dprintf( ("readrec saw <%s>, returns %d\n", *buf, c == EOF && rr == *buf ? 0 : 1) );
	return c == EOF && rr == *buf ? 0 : 1;
}
void recbld(void) { int i; register char *r, *p; if (donefld == 0 || donerec == 1) return; r = record; for (i = 1; i <= *NF; i++) { p = getsval(fldtab[i]); while (*r++ = *p++) { if (r >= record+RECSIZE) { size_t diff = r - record; growrec(); r = &record[diff]; } } *(r-1) = **OFS; } *(r-1) = '\0'; dprintf("in recbld FS=%lo, recloc=%lo\n", (long)**FS, (long)recloc); recloc->tval = STR | FLD; dprintf("in recbld FS=%lo, recloc=%lo\n", (long)**FS, (long)recloc); /*if (r > record+RECSIZE) error(FATAL, "built giant record `%.20s...'", record);*/ dprintf("recbld = |%s|\n", record); }
/*
 * makerec - install data as the current record.
 *
 * data: NUL-terminated text to copy into the record buffer.
 * size: buffer size requested for it; recdata is enlarged through
 *       growrec() when recsize is smaller.
 *       NOTE(review): presumably size accounts for the terminating NUL,
 *       since the copy is an unbounded strcpy -- confirm against callers.
 *
 * The previous recloc->sval is released unless it is flagged DONTFREE.
 * Marks the record as current (donerec = 1) and the fields as not yet
 * split (donefld = 0).  Returns record.
 */
unsigned char *makerec(const unsigned char *data, int size)
{
	if ((recloc->tval & DONTFREE) == 0)
		xfree(recloc->sval);

	if (size > recsize)
		growrec(&recdata, &recsize, size, NULL, 0);

	record = recdata;
	strcpy((char *)record, (char *)data);

	recloc->tval = REC | STR | DONTFREE;
	recloc->sval = record;

	donefld = 0;
	donerec = 1;
	return record;
}
/*
 * grow - build the tree t from dataset d.
 *
 * Allocates the root node, sets its pos/neg totals to the summed weights
 * of the valid examples (t->valid[i] > 0) with a true / false target, each
 * started at FLT_EPSILON and capped at 1 - FLT_EPSILON, and then grows the
 * tree recursively with growrec().
 *
 * If the root cannot be allocated, t->root is left NULL and the function
 * returns without building anything (the allocation used to be
 * dereferenced unchecked).
 */
void grow(tree_t* t, dataset_t* d){
	int i;

	/* Initialize root fields */
	t->root = malloc(sizeof(node_t));
	if (t->root == NULL)
		return;
	t->root->pos = FLT_EPSILON;
	t->root->neg = FLT_EPSILON;

	for (i = 0; i < d->nex; i++) {
		if (t->valid[i] <= 0)
			continue;	/* example is not in play */
		if (d->target[i])
			t->root->pos += d->weight[i];
		else
			t->root->neg += d->weight[i];
	}

	t->root->pos = min(1-FLT_EPSILON, t->root->pos);
	t->root->neg = min(1-FLT_EPSILON, t->root->neg);

	/* Recursively grow tree */
	growrec(t, t->root, d, 0);
}
/*
 * growrec - recursively grow the subtree rooted at root.
 *
 * t:     tree being built; t->valid marks the examples that reach the
 *        current node (valid[x] > 0) and t->used marks features in use.
 * root:  node to split; its pos/neg totals must already be set.
 * d:     training data.
 * depth: depth of root (0 for the tree root).
 *
 * The node becomes a leaf (split = -1) when the maximum depth is reached,
 * when it is pure, when bestSplit() finds nothing usable, when one side of
 * the split would be empty, or when the child nodes cannot be allocated
 * (the allocations used to be dereferenced unchecked).
 * On return t->valid and t->used are back in the state they had on entry.
 */
void growrec(tree_t* t, node_t* root, dataset_t* d, int depth){
	split_t best;
	int i,k,l,u;
	node_t* first;
	node_t* second;
	node_t* left;
	node_t* right;
	evpair_t* b;

	/* Stop if max depth is reached or node is pure */
	if(depth>=t->maxdepth || root->pos <= FLT_EPSILON || root->neg <= FLT_EPSILON){
		root->split=-1;
		return;
	}

	/* Find the best split */
	best = bestSplit(t,root,d);

	/* Stop if no good split is left or the counts in one of the children are very small */
	if (best.feature < 0
	    || (best.posleft <= FLT_EPSILON && best.negleft <= FLT_EPSILON)
	    || (best.posright <= FLT_EPSILON && best.negright <= FLT_EPSILON)){
		root->split=-1;
		return;
	}

	/* Allocate both children before touching the node, so a failed
	 * allocation leaves a well-formed leaf behind */
	left=malloc(sizeof(node_t));
	right=malloc(sizeof(node_t));
	if(left==NULL || right==NULL){
		free(left);
		free(right);
		root->split=-1;
		return;
	}

	/* Install the split */
	root->split=best.feature;
	root->threshold=best.threshold;
	root->left=left;
	root->left->pos=best.posleft;
	root->left->neg=best.negleft;
	root->right=right;
	root->right->pos=best.posright;
	root->right->neg=best.negright;

	/* Mark the feature as used */
	if(!d->cont[best.feature])
		t->used[best.feature]=1;

	b = d->feature[best.feature];

	/* Find the first example whose value exceeds the threshold
	 * (binary search; assumes b is sorted by increasing value -- TODO confirm) */
	k = 0;
	u = d->size[best.feature];
	while (k < u) {
		i = (k + u)/2;
		if (b[i].value > best.threshold)
			u = i;
		else
			k = i + 1;
	}

	if (best.threshold > 0){
		/* X = entries above the threshold; left child is built first */
		l=k;
		u=d->size[best.feature];
		first = root->left;
		second = root->right;
	}
	else{
		/* X = entries not above the threshold; right child is built first */
		l=0;
		u=k;
		first = root->right;
		second = root->left;
	}

	/* Here's how this works when threshold > 0. The other case is analogous:
	 * Let X be the set of all examples b[l..u), i.e. those whose feature
	 * best.feature has value > threshold.
	 * For every x in X decrease valid[x] by 1.
	 * This leads to valid[x] > 0 iff x was previously valid and is not in X
	 *
	 * build left subtree (using the valid examples)
	 *
	 * For every x in X increase valid[x] by 2.
	 * For every example x decrease valid[x] by 1.
	 * This leads to valid[x] > 0 iff x was previously valid and is in X
	 *
	 * build right subtree (using the valid examples)
	 *
	 * Finally restore:
	 * For every x in X decrease valid[x] by 1.
	 * For every example x increase valid[x] by 1.
	 * This makes valid obtain its original state
	 * (One can verify this by adding up all the transformations)
	 */
	for(i=l; i<u; i++)
		t->valid[b[i].example]-=1;

	growrec(t, first, d, depth+1);

	for(i=l; i<u; i++)
		t->valid[b[i].example]+=2;
	for(i=0; i<d->nex; i++)
		t->valid[i]-=1;

	growrec(t, second, d, depth+1);

	for(i=l; i<u; i++)
		t->valid[b[i].example]-=1;
	for(i=0; i<d->nex; i++)
		t->valid[i]+=1;

	/* Unmark the feature */
	if(!d->cont[best.feature])
		t->used[best.feature]=0;
}
int getrec(void) { register char *rr; extern int svargc; extern char **svargv; register int c, sep, k, m, n; wchar_t wc; dprintf("**RS=%o, **FS=%o\n", **RS, **FS); donefld = 0; donerec = 1; record[0] = 0; while (svargc > 0) { dprintf("svargc=%d, *svargv=%s\n", svargc, *svargv); if (infile == NULL) { /* have to open a new file */ if (member('=', *svargv)) { /* it's a var=value argument */ setclvar(*svargv); svargv++; svargc--; continue; } *FILENAME = file = *svargv; dprintf("opening file %s\n", file); if (*file == '-') { if (yyin == stdin && ! lexprog) error(FATAL, "standard input already used for reading commands"); else infile = stdin; } else if ((infile = fopen(file, "r")) == NULL) error(FATAL, "can't open %s", file); } next(wc, *RS, n); if ((sep = **RS) == 0) sep = '\n'; for (rr = record; ; ) { cont: for (; (c=getc(infile)) != sep && c != EOF; *rr++ = c) { if (rr >= record+RECSIZE-n-3) { size_t diff = rr - record; growrec(); rr = &record[diff]; } } if (c != EOF) { /* * Note: This code does not restrict occurences * of the multibyte sequence in RS to the start * of an input character. */ for (m = 1; m < n; m++) { if ((c = getc(infile)) == EOF || c != (*RS)[m]) { for (k = 0; k < m; k++) *rr++ = (*RS)[k]; if (c == EOF) break; *rr++ = c; goto cont; } } } if (**RS == sep || c == EOF) break; if ((c = getc(infile)) == '\n' || c == EOF) /* 2 in a row */ break; if (rr >= record+RECSIZE-n-3) { size_t diff = rr - record; growrec(); rr = &record[diff]; } *rr++ = '\n'; *rr++ = c; } *rr = 0; if (mustfld) fldbld(); if (c != EOF || rr > record) { /* normal record */ recloc->tval &= ~NUM; recloc->tval |= STR; ++nrloc->fval; nrloc->tval &= ~STR; nrloc->tval |= NUM; return(1); } /* EOF arrived on this file; set up next */ if (infile != stdin) fclose(infile); infile = NULL; svargc--; svargv++; } return(0); /* true end of file */ }