void PullOutMatchedReads(struct tree *t, FILE *fastq, FILE *out) { int location = 0; int write = 0; char *line = NULL; size_t length; ssize_t read; struct split_data *idFirst; struct split_data *idLast; char *tmp; char *s; long long int id = 0; int i = 0; while ((read = getline(&line, &length, fastq)) != -1) { if (location == 0) { idFirst = (struct split_data*)malloc(sizeof(struct split_data)); idLast = (struct split_data*)malloc(sizeof(struct split_data)); idFirst = splitData(line, " "); idLast = splitData((idFirst->tokenize)[0], ":"); s = (char *)malloc(100*sizeof(char)); (idLast->tokenize)[idLast->elements-2] = PadWithZeros((idLast->tokenize)[idLast->elements-2]); (idLast->tokenize)[idLast->elements-1] = PadWithZeros((idLast->tokenize)[idLast->elements-1]); sprintf(s, "%s%s%s%s", (idLast->tokenize)[1], (idLast->tokenize)[idLast->elements-3], (idLast->tokenize)[idLast->elements-2], (idLast->tokenize)[idLast->elements-1]); id = strtoll(s, &tmp, 0); write = Lookup(t->root, id); location++; for (i = 0; i < idFirst->elements; i++) { free((idFirst->tokenize)[i]); } free((idFirst->tokenize)); free(idFirst); for (i = 0; i < idLast->elements; i++) { free((idLast->tokenize)[i]); } free((idLast->tokenize)); free(idLast); free(s); } else if (location == 3) { location = 0; } else { location++; } if (write) { fprintf(out, "%s", line); } } }
// Builds a tree node from `data` and recursively grows its children.
//
// The node always stores a prediction (average or mode of `data`, selected by
// args.pred).  It becomes a leaf when the depth limit is reached, when same()
// reports the data as uniform, when no split is found, or when the split
// leaves the YES or NO side empty.  Otherwise the split feature/value are
// recorded and one child per non-empty partition is constructed.
DTNode::DTNode(const vector<Tuple*>& data, ARGS& myargs,
               int maxdepth,  // the maximum depth of the tree
               int depth,     // current depth in the tree
               int fk, bool par)
    : leaf(0), feature(0), value(UNKNOWN), leaf_depth(depth), pred(-1), args(myargs) {
    // no children until a split succeeds
    for (int slot = 0; slot < CHILDTYPES; ++slot) {
        child[slot] = 0;
    }

    // prediction stored at this node
    pred = (args.pred == ALG_MEAN) ? average(data) : mode(data);

    // stop at the depth limit or when there is nothing left to separate
    const bool depthReached = (depth == maxdepth);
    if (depthReached || same(data)) {
        leaf = true;
        return;
    }

    // look for a split criterion; without one this node stays a leaf
    int splitFeature;
    double splitValue;
    const bool haveSplit = findSplitByHistogram(data, myargs.max_feature_index,
                                                splitFeature, splitValue, fk, par, args);
    if (!haveSplit) {
        leaf = true;
        return;
    }

    // partition the data into the YES / NO / MISSING groups
    vector<Tuple*> partitions[CHILDTYPES];
    splitData(data, partitions, splitFeature, splitValue, args);
    if (partitions[YES].empty() || partitions[NO].empty()) {
        leaf = true;
        return;
    }

    // record the split and recurse (YES first, then NO, then MISSING)
    feature = splitFeature;
    value = splitValue;
    child[YES] = new DTNode(partitions[YES], myargs, maxdepth, depth + 1, fk, par);
    child[NO] = new DTNode(partitions[NO], myargs, maxdepth, depth + 1, fk, par);
    if (partitions[MISSING].empty()) {
        child[MISSING] = 0;
    } else {
        child[MISSING] = new DTNode(partitions[MISSING], myargs, maxdepth, depth + 1, fk, par);
    }
}
void DiveThroughFixrank(struct tree *t, FILE *fixrank, char *taxlevel, char *name) { char *line = NULL; size_t length; ssize_t read; char *s; char *tmp; long long int id; struct split_data *nameSearch = (struct split_data*)malloc(sizeof(struct split_data)); struct split_data *idFirst = (struct split_data*)malloc(sizeof(struct split_data)); struct split_data *idLast = (struct split_data*)malloc(sizeof(struct split_data)); int i = 0; /*Read the entire file*/ while ((read = getline(&line, &length, fixrank)) != -1) { nameSearch = splitData(line, "\t"); for (i = 0; i < nameSearch->elements; i++) { /*if there is a match, the next should be taxlevel*/ if (strcmp(name, (nameSearch->tokenize)[i]) == 0) { if (strcmp(taxlevel, (nameSearch->tokenize)[i+1]) == 0) { idLast = splitData((nameSearch->tokenize)[0], "|"); idFirst = splitData((idLast->tokenize)[0], ":"); s = (char *)malloc(100*sizeof(char)); (idFirst->tokenize)[5] = PadWithZeros((idFirst->tokenize)[5]); (idFirst->tokenize)[6] = PadWithZeros((idFirst->tokenize)[6]); sprintf(s, "%s%s%s%s", (idFirst->tokenize)[1], (idFirst->tokenize)[4], (idFirst->tokenize)[5], (idFirst->tokenize)[6]); id = strtoll(s, &tmp, 0); free(s); AddNode(&(t->root), id); (t->count)++; } free((nameSearch->tokenize)[i]); } } } //printf("%d\n", (t->count)); }
// Turns `sourceNode` into an internal node by splitting on the best-scoring
// attribute, or into a leaf when the split would leave branch 0 or 1 empty.
//
// The attribute is the one with the highest entry in m_vals; its split point,
// proportions and per-branch class distributions are taken from m_splits,
// m_props and m_dists, which are then cleared.  For every non-empty branch a
// child node is created, appended to m_treeNodes and to sourceNode's
// children, and `newNodes` is incremented.
//
// sourceNode - node to split; its sortedIndices are cleared once the children
//              own their subsets
// newNodes   - running counter, incremented once per child created
void SerialRandomTree::createSplitNodes(SerialTreeNodePtr sourceNode, int &newNodes){
    // find best attribute
    sourceNode->m_Attribute = maxIndex(m_vals);
    sourceNode->m_SplitPoint = m_splits[sourceNode->m_Attribute];
    sourceNode->m_Prop = m_props[sourceNode->m_Attribute];

    // remember dist for most important attribute before the scratch buffers are reset
    std::vector<std::vector<double>> chosenAttDists = m_dists[sourceNode->m_Attribute];
    m_dists.clear();
    m_splits.clear();
    m_props.clear();
    m_vals.clear();

    const size_t branchCount = chosenAttDists.size();

    // subsetIndices[branch][attribute] -> indices belonging to that branch
    std::vector<std::vector<std::vector<int>>> subsetIndices(
        branchCount,
        std::vector<std::vector<int>>(data->numAttributes, std::vector<int>()));
    splitData(subsetIndices, sourceNode->m_Attribute, sourceNode->m_SplitPoint, sourceNode->sortedIndices);

    // Do not split if one branch is empty (or if there are not even two
    // branches to inspect).
    if(branchCount < 2 || subsetIndices[0][0].size() == 0 || subsetIndices[1][0].size() == 0){
        createLeafNode(sourceNode);
        sourceNode->clean();
        return;
    }

    for(size_t i = 0; i < branchCount; i++){
        // One handle per iteration: the previous fixed two-slot vector was
        // written out of bounds whenever there were more than two branches.
        SerialTreeNodePtr branchNode(new SerialTreeNode);

        // check if we're about to make an empty branch - this can happen with
        // nominal attributes with more than two categories (as of ver. 0.98)
        if(subsetIndices[i][0].size() == 0){
            // NOTE(review): this distribution is a local copy that is not read
            // again, and no child is attached for the empty branch - confirm
            // that is intended.
            for(size_t j = 0; j < chosenAttDists[i].size(); j++)
                chosenAttDists[i][j] = sourceNode->classProbs[j] / sourceNode->sortedIndices[0].size();
        }
        else{
            branchNode->m_Attribute = -1;
            branchNode->sortedIndices = subsetIndices[i];
            branchNode->classProbs = chosenAttDists[i];
            m_treeNodes.push_back(branchNode);
            newNodes++;
            sourceNode->m_children.push_back(branchNode);
        }
    }

    sourceNode->sortedIndices.clear();
}
int main( int argc, char *argv[] ) { nn_type nn; double atof(); FILE *log_file; FILE *out_file; if( argc != 6 ) { fprintf( stderr, "Usage: nn learning_rate k hidden log_file out_file < digits_train.txt\n" ); fprintf( stderr, " log_file - file to record progress of training\n"); fprintf( stderr, " out_file - file to record final network\n"); exit(0); } nn.learning_rate = atof( argv[1] ); nn.k = atof( argv[2] ); nn.n_hidden = atoi( argv[3] ); if( (log_file = fopen( argv[4], "w" )) == NULL ) { fprintf( stderr, "Could not open file %s\n", argv[4] ); exit( 0 ); } if( (out_file = fopen( argv[5], "w" )) == NULL ) { fprintf( stderr, "Could not open file %s\n", argv[5] ); exit( 0 ); } fprintf( log_file, "learning rate: %0.2f\n", nn.learning_rate ); fprintf( log_file, "multiplicative constant (k): %0.1f\n", nn.k ); fprintf( log_file, "hidden units: %d\n", nn.n_hidden ); /* * Number of input lines. * NO NEED TO CHANGE THIS. */ nn.n_input = 64; /* * Number of output lines. * NO NEED TO CHANGE THIS. */ nn.n_output = 10; /* * Total amount of data. * YOU MAY WISH TO CHANGE THIS. */ all_data.n = N_EXAMPLES; /* total amount of data */ if( all_data.n > N_EXAMPLES ) { fprintf( stderr, "Too many examples; increase N_EXAMPLES\n" ); exit( 0 ); } readData( nn.n_input, nn.n_output, &all_data ); splitData( nn.n_input, nn.n_output, &all_data, &training_data, &test_data ); trainNetwork( log_file, &nn, &training_data ); testNetwork( log_file, &nn, &test_data ); printNetwork( out_file, &nn ); fclose( log_file ); fclose( out_file ); }
// add an element to the tree nodePtr buildIndex( rootNodePtr r, // root pointer size_t dim, // current dim size_t m, // current length of obs size_t * indexPtr, // pointer to obs indexes int useProb, // determine if we use probability to build an index double * prob, size_t * nodeIdentity ) { size_t i,K; size_t * indexLeftPtr = NULL; size_t * indexRightPtr = NULL; size_t indexLeftSize; size_t indexRightSize; double probSum = 0; nodePtr c = createNode(r); // record to the tree structure the new tree c->indexUsed = m; c->index = indexPtr; c->dim = dim; K = r->K; // do we have too many points? if(!useProb) { if( m <= r->leafSize ) { // save the final pointer locations for( i = 0; i < m; i++) { // go through each element of indexPtr and store a pointer to that indexPtr element in pointerIndex r->pointerIndex[ indexPtr[i] ] = &( indexPtr[i] ); r->nodeIndex[ indexPtr[i] ] = *nodeIdentity; // printf(" node assignment %d is %d\n", (int) indexPtr[i], (int) *nodeIdentity ); } *nodeIdentity = *nodeIdentity + 1; return c; } } else { // if using probSize we want to figure out how many samples per psu for( i = 0; i < m; i++) { probSum += prob[ indexPtr[i] ]; // go through each element of indexPtr and store a pointer to that indexPtr element in pointerIndex } if(probSum <= r->leafSize) { for( i = 0; i < m; i++) { r->pointerIndex[ indexPtr[i] ] = &( indexPtr[i] ); r->nodeIndex[ indexPtr[i] ] = *nodeIdentity; // printf(" node assignment %d is %d\n", (int) indexPtr[i], (int) *nodeIdentity ); } *nodeIdentity = *nodeIdentity + 1; return c; } #ifdef DEBUG_PROB printf("split!\n"); #endif } // if we are here we have too many points // create children // figure out our new dim // split data and give to children if( useProb ) { c->split = splitDataProb( r->data, c->index, &indexLeftPtr, &indexRightPtr, &indexLeftSize, &indexRightSize, m, K, dim, prob ); #ifdef DEBUG_PROB printf("Left Side Size = %d, Right Side Size = %d split = %f\n", (int) indexLeftSize, (int) indexRightSize, c->split); 
printf("Left\n:"); for(i=0; i < indexLeftSize; i++) printf("%d ", (int) indexLeftPtr[i]); printf("\nRight\n:"); for(i=0; i < indexRightSize; i++) printf("%d ", (int) indexRightPtr[i]); printf("\n"); #endif } else { c-> split = splitData( r->data, c->index, &indexLeftPtr, &indexRightPtr, &indexLeftSize, &indexRightSize, m, K, dim ); } free(c->index); c->index = NULL; // move current contents to new children c->left = buildIndex( r, (dim+1) % K, indexLeftSize , indexLeftPtr, useProb, prob, nodeIdentity); c->right = buildIndex( r, (dim+1) % K, indexRightSize, indexRightPtr, useProb, prob, nodeIdentity); return c; }