示例#1
0
/*
 * Copies selected records from `fastq` to `out`.
 *
 * The input is consumed in groups of four lines. The first line of each group
 * is split on " " and then on ":"; a numeric id is assembled from token 1 and
 * the last three ":"-separated tokens (the final two passed through
 * PadWithZeros), and looked up in t->root. The Lookup() result decides whether
 * that line and the three following it are echoed to `out`.
 *
 * A first line with fewer than three ":"-separated fields cannot yield an id;
 * such a group is not written.
 *
 * t     - tree whose root is handed to Lookup()
 * fastq - input stream, read until getline() fails
 * out   - output stream receiving the matched lines
 */
void PullOutMatchedReads(struct tree *t, FILE *fastq, FILE *out) {

	int location = 0;	/* index (0..3) of the current line within its group */
	int write = 0;		/* non-zero while the current group is being echoed */
	char *line = NULL;
	size_t length = 0;
	ssize_t read;

	struct split_data *idFirst;
	struct split_data *idLast;

	char s[100];		/* concatenated id digits; writes are bounded by snprintf */
	long long int id = 0;
	int i = 0;

	while ((read = getline(&line, &length, fastq)) != -1) {
		if (location == 0) {
			/* splitData() hands back its own allocation, so nothing is
			 * malloc'd here beforehand. */
			idFirst = splitData(line, " ");
			idLast = NULL;
			write = 0;

			if (idFirst->elements > 0) {
				idLast = splitData((idFirst->tokenize)[0], ":");
			}

			/* Indices 1, elements-3, elements-2 and elements-1 are all
			 * valid only when there are at least three tokens. */
			if (idLast != NULL && idLast->elements >= 3) {
				/* NOTE(review): if PadWithZeros returns a fresh buffer the
				 * token it replaces is never released - confirm its contract. */
				(idLast->tokenize)[idLast->elements-2] = PadWithZeros((idLast->tokenize)[idLast->elements-2]);
				(idLast->tokenize)[idLast->elements-1] = PadWithZeros((idLast->tokenize)[idLast->elements-1]);

				snprintf(s, sizeof(s), "%s%s%s%s",
					(idLast->tokenize)[1],
					(idLast->tokenize)[idLast->elements-3],
					(idLast->tokenize)[idLast->elements-2],
					(idLast->tokenize)[idLast->elements-1]);
				/* Base 0 is kept so ids stay comparable with the ones
				 * DiveThroughFixrank stores using the same conversion. */
				id = strtoll(s, NULL, 0);
				write = Lookup(t->root, id);
			}
			location++;

			/* Release both split results: tokens, token array, struct. */
			for (i = 0; i < idFirst->elements; i++) {
				free((idFirst->tokenize)[i]);
			}
			free((idFirst->tokenize));
			free(idFirst);
			if (idLast != NULL) {
				for (i = 0; i < idLast->elements; i++) {
					free((idLast->tokenize)[i]);
				}
				free((idLast->tokenize));
				free(idLast);
			}
		} else if (location == 3) {
			location = 0;
		} else {
			location++;
		}

		if (write) {
			fprintf(out, "%s", line);
		}

	}

	/* getline() allocates and grows `line`; the caller owns it. */
	free(line);
}
示例#2
0
// Constructs a tree node from `data` and recursively constructs its children.
//
// The node becomes a leaf when the depth limit is reached, when same(data)
// holds, when findSplitByHistogram() reports no split, or when the chosen
// split leaves the YES or NO partition empty. Otherwise the split feature and
// value are recorded and one child is built per non-empty partition.
//
//   data     - tuples assigned to this node
//   myargs   - shared arguments; also bound to the `args` member
//   maxdepth - the maximum depth of the tree
//   depth    - current depth in the tree
//   fk, par  - forwarded unchanged to findSplitByHistogram() and to children
DTNode::DTNode(const vector<Tuple*>& data,
          ARGS& myargs,
          int maxdepth,
          int depth,
          int fk,
          bool par)
    : leaf(0), feature(0), value(UNKNOWN), leaf_depth(depth), pred(-1), args(myargs){
    // Start with no children; slots are filled only once a split is committed.
    for (int slot = 0; slot < CHILDTYPES; ++slot) {
      child[slot] = 0;
    }

    // Prediction stored at this node: mean or mode of the data, per args.pred.
    pred = (args.pred == ALG_MEAN) ? average(data) : mode(data);

    // Stop at the depth limit, or when same(data) says there is nothing to split.
    const bool depthExhausted = (depth == maxdepth);
    if (depthExhausted || same(data)) {
      leaf = true;
      return;
    }

    // Look for a split criterion; without one this node stays a leaf.
    int splitFeature;
    double splitValue;
    const bool haveSplit = findSplitByHistogram(data,
         myargs.max_feature_index, splitFeature, splitValue, fk, par, args);
    if (!haveSplit) {
      leaf = true;
      return;
    }

    // Partition the data into the CHILDTYPES groups using the criterion found.
    vector<Tuple*> partition[CHILDTYPES];
    splitData(data, partition, splitFeature, splitValue, args);

    // A split that leaves either the YES or the NO side empty is not used.
    if (partition[YES].empty() || partition[NO].empty()) {
      leaf = true;
      return;
    }

    // Record the split, then recurse one level deeper.
    feature = splitFeature;
    value = splitValue;

    const int nextDepth = depth + 1;
    child[YES] = new DTNode(partition[YES], myargs, maxdepth, nextDepth, fk, par);
    child[NO] = new DTNode(partition[NO], myargs, maxdepth, nextDepth, fk, par);

    // The MISSING child exists only when some tuples landed in that partition.
    if (!partition[MISSING].empty()) {
      child[MISSING] = new DTNode(partition[MISSING], myargs, maxdepth, nextDepth, fk, par);
    } else {
      child[MISSING] = 0;
    }
}
示例#3
0
/*
 * Releases a splitData() result: every token, the token array and the struct.
 * This is the same release sequence PullOutMatchedReads applies to its split
 * results. A NULL argument is ignored.
 */
static void FreeFixrankSplit(struct split_data *parts) {
	int k;

	if (parts == NULL) {
		return;
	}
	for (k = 0; k < parts->elements; k++) {
		free((parts->tokenize)[k]);
	}
	free(parts->tokenize);
	free(parts);
}

/*
 * Scans `fixrank` line by line and records in `t` the id of every line that
 * contains the token `name` immediately followed by the token `taxlevel`.
 *
 * Each line is split on tabs. For a match, the line's first token is cut at
 * "|", its leading part is split on ":", and an id is assembled from fields
 * 1, 4, 5 and 6 (fields 5 and 6 passed through PadWithZeros). The id is added
 * with AddNode() and t->count is incremented. A matching line whose first
 * token has fewer than seven ":"-separated fields is skipped.
 *
 * t        - tree receiving the ids (root and count are updated)
 * fixrank  - input stream, read until getline() fails
 * taxlevel - token that must directly follow `name`
 * name     - token searched for in each line
 */
void DiveThroughFixrank(struct tree *t, FILE *fixrank, char *taxlevel, char *name) {
	char *line = NULL;
	size_t length = 0;
	ssize_t read;
	char s[100];		/* concatenated id digits; writes are bounded by snprintf */
	long long int id;
	struct split_data *nameSearch;
	struct split_data *idFirst;
	struct split_data *idLast;
	int i = 0;

	/*Read the entire file*/
	while ((read = getline(&line, &length, fixrank)) != -1) {
		nameSearch = splitData(line, "\t");

		/* Stop one short of the end: a match needs a following token, and
		 * reading tokenize[i+1] on the last index would run off the array. */
		for (i = 0; i + 1 < nameSearch->elements; i++) {
			/*if there is a match, the next should be taxlevel*/
			if (strcmp(name, (nameSearch->tokenize)[i]) != 0) {
				continue;
			}
			if (strcmp(taxlevel, (nameSearch->tokenize)[i+1]) != 0) {
				continue;
			}

			idLast = splitData((nameSearch->tokenize)[0], "|");
			idFirst = NULL;
			if (idLast->elements > 0) {
				idFirst = splitData((idLast->tokenize)[0], ":");
			}

			/* Fields 1, 4, 5 and 6 are read below, so seven are required. */
			if (idFirst != NULL && idFirst->elements >= 7) {
				/* NOTE(review): if PadWithZeros returns a fresh buffer the
				 * token it replaces is never released - confirm its contract. */
				(idFirst->tokenize)[5] = PadWithZeros((idFirst->tokenize)[5]);
				(idFirst->tokenize)[6] = PadWithZeros((idFirst->tokenize)[6]);

				snprintf(s, sizeof(s), "%s%s%s%s",
					(idFirst->tokenize)[1],
					(idFirst->tokenize)[4],
					(idFirst->tokenize)[5],
					(idFirst->tokenize)[6]);
				/* Base 0 is kept so ids stay comparable with the ones
				 * PullOutMatchedReads looks up using the same conversion. */
				id = strtoll(s, NULL, 0);
				AddNode(&(t->root), id);
				(t->count)++;
			}

			FreeFixrankSplit(idFirst);
			FreeFixrankSplit(idLast);
		}

		/* The whole tab split is released once the line has been examined. */
		FreeFixrankSplit(nameSearch);
	}

	/* getline() allocates and grows `line`; the caller owns it. */
	free(line);
}
	/**
	 * Turns sourceNode into a split node, or into a leaf when the split would
	 * leave one of the first two branches empty.
	 *
	 * The attribute with the best score in m_vals is selected, its split point
	 * and proportions are stored on sourceNode, and the per-attribute scratch
	 * vectors (m_dists, m_splits, m_props, m_vals) are cleared. The node's
	 * sorted indices are then partitioned with splitData(); every non-empty
	 * branch becomes a new child that is appended to m_treeNodes and to
	 * sourceNode->m_children.
	 *
	 * @param sourceNode node being split; its sortedIndices are cleared on a split
	 * @param newNodes   incremented once for every child node created
	 */
	void SerialRandomTree::createSplitNodes(SerialTreeNodePtr sourceNode, int &newNodes){
		sourceNode->m_Attribute = maxIndex(m_vals);   // find best attribute
		sourceNode->m_SplitPoint = m_splits[sourceNode->m_Attribute];
		sourceNode->m_Prop = m_props[sourceNode->m_Attribute];

		// Keep the class distributions of the chosen attribute before the scratch state is dropped.
		std::vector<std::vector<double>> chosenAttDists = m_dists[sourceNode->m_Attribute];
		m_dists.clear();
		m_splits.clear();
		m_props.clear();
		m_vals.clear();

		// One set of per-attribute index lists for every branch.
		std::vector<std::vector<std::vector<int>>> subsetIndices(chosenAttDists.size(),std::vector<std::vector<int>>(data->numAttributes,std::vector<int>()));
		splitData(subsetIndices, sourceNode->m_Attribute, sourceNode->m_SplitPoint, sourceNode->sortedIndices);

		// Do not split if one branch is empty
		if(subsetIndices[0][0].size() == 0 || subsetIndices[1][0].size() == 0 ){
			createLeafNode(sourceNode);
			sourceNode->clean();
			return;
		}

		for(size_t i = 0; i < chosenAttDists.size(); i++){
			// Each branch gets its own node object. The earlier fixed two-slot
			// vector was indexed by i and overflowed as soon as an attribute
			// produced more than two branches.
			SerialTreeNodePtr branchNode(new SerialTreeNode);

			// check if we're about to make an empty branch - this can happen with
			// nominal attributes with more than two categories (as of ver. 0.98)
			if(subsetIndices[i][0].size() == 0){
				for(size_t j = 0; j < chosenAttDists[i].size(); j++)
					chosenAttDists[i][j] = sourceNode->classProbs[j] / sourceNode->sortedIndices[0].size();
			}
			else{
				branchNode->m_Attribute = -1;
				branchNode->sortedIndices = subsetIndices[i];
				branchNode->classProbs = chosenAttDists[i];
				m_treeNodes.push_back(branchNode);
				newNodes++;
				sourceNode->m_children.push_back(branchNode);
			}
		}
		// The indices now live in the children.
		sourceNode->sortedIndices.clear();
	}
示例#5
0
/*
 * Program entry point: trains and tests a network, then writes it out.
 *
 * Expected arguments: learning_rate k hidden log_file out_file
 * (the usage text shows the input being redirected from digits_train.txt).
 *
 * The three numeric arguments configure the network, both files are opened
 * for writing, and readData / splitData / trainNetwork / testNetwork /
 * printNetwork are run in that order.
 *
 * Returns 0 on success. Exits with status 1 on a usage error, when either
 * file cannot be opened, or when the example count exceeds N_EXAMPLES.
 */
int
main( int argc, char *argv[] )
{
    nn_type nn;
    FILE    *log_file;
    FILE    *out_file;

    /* atof() comes from <stdlib.h>, like the atoi() and exit() used below;
       no local declaration is needed. */

    if( argc != 6 ) {
        fprintf( stderr, "Usage: nn learning_rate k hidden log_file out_file < digits_train.txt\n" );
        fprintf( stderr, "       log_file - file to record progress of training\n");
        fprintf( stderr, "       out_file - file to record final network\n");
        exit( 1 );
    }

    nn.learning_rate = atof( argv[1] );
    nn.k             = atof( argv[2] );
    nn.n_hidden      = atoi( argv[3] );

    if( (log_file = fopen( argv[4], "w" )) == NULL ) {
        fprintf( stderr, "Could not open file %s\n", argv[4] );
        exit( 1 );
    }

    if( (out_file = fopen( argv[5], "w" )) == NULL ) {
        fprintf( stderr, "Could not open file %s\n", argv[5] );
        fclose( log_file );
        exit( 1 );
    }

    /* Record the run configuration at the top of the log. */
    fprintf( log_file, "learning rate: %0.2f\n", nn.learning_rate );
    fprintf( log_file, "multiplicative constant (k): %0.1f\n", nn.k );
    fprintf( log_file, "hidden units: %d\n", nn.n_hidden );

    /*
     *  Number of input lines.
     *  NO NEED TO CHANGE THIS.
     */
    nn.n_input = 64;

    /*
     *  Number of output lines.
     *  NO NEED TO CHANGE THIS.
     */
    nn.n_output = 10;

    /*
     *  Total amount of data.
     *  YOU MAY WISH TO CHANGE THIS.
     *  The check below only fires once the assignment is edited to a
     *  value larger than N_EXAMPLES.
     */
    all_data.n = N_EXAMPLES;  /* total amount of data */
    if( all_data.n > N_EXAMPLES ) {
        fprintf( stderr, "Too many examples; increase N_EXAMPLES\n" );
        fclose( log_file );
        fclose( out_file );
        exit( 1 );
    }

    readData( nn.n_input, nn.n_output, &all_data );
    splitData( nn.n_input, nn.n_output,
              &all_data, &training_data, &test_data );
    trainNetwork( log_file, &nn, &training_data );
    testNetwork(  log_file, &nn, &test_data );
    printNetwork( out_file, &nn );

    fclose( log_file );
    fclose( out_file );

    return 0;
}
示例#6
0
// Recursively builds the tree node covering the observations in indexPtr.
//
// A node is created with createNode(r) and tagged with the index array, its
// length and the current dimension. When the group is small enough it stays a
// leaf: every observation gets its entry in r->pointerIndex and r->nodeIndex
// filled in and *nodeIdentity advances by one. Otherwise the indexes are
// divided by splitData / splitDataProb, the node's own index array is freed,
// and both halves are built recursively on dimension (dim+1) % K.
//
// "Small enough" means m <= r->leafSize without probabilities, or the sum of
// prob[] over the group <= r->leafSize when useProb is set.
//
// Returns the node created for this call.
nodePtr buildIndex( 
    rootNodePtr r,      // root pointer 
    size_t dim,         // current dim
    size_t m,           // current length of obs
    size_t * indexPtr,  // pointer to obs indexes 
    int useProb,        // determine if we use probability to build an index
    double * prob,
    size_t * nodeIdentity
  ) {

  size_t pos, K;
  size_t nextDim;
  size_t * leftIdx = NULL;
  size_t * rightIdx = NULL;
  size_t leftCount;
  size_t rightCount;
  double weight = 0;
  int fitsInLeaf;

  nodePtr node = createNode(r);

  // describe this node's slice of the data
  node->indexUsed = m;
  node->index = indexPtr;
  node->dim = dim;

  K = r->K;

  // decide whether this group is small enough to remain a single leaf
  if( useProb ) {
    // with probabilities, the size of the group is its summed probability
    for( pos = 0; pos < m; pos++ ) {
      weight += prob[ indexPtr[pos] ];
    }
    fitsInLeaf = ( weight <= r->leafSize );
  } else {
    fitsInLeaf = ( m <= r->leafSize );
  }

  if( fitsInLeaf ) {
    // remember where each observation's slot lives and which leaf owns it
    for( pos = 0; pos < m; pos++ ) {
      r->pointerIndex[ indexPtr[pos] ] = &( indexPtr[pos] );
      r->nodeIndex[ indexPtr[pos] ] = *nodeIdentity;
    }
    *nodeIdentity = *nodeIdentity + 1;
    return node;
  }

  // too many points for one leaf: divide them between two children
  if( useProb ) {
#ifdef DEBUG_PROB  
    printf("split!\n");
#endif
    node->split = splitDataProb(
      r->data,
      node->index,
      &leftIdx,
      &rightIdx,
      &leftCount,
      &rightCount,
      m,
      K,
      dim,
      prob
      );
#ifdef DEBUG_PROB  
    printf("Left Side Size = %d, Right Side Size = %d split = %f\n", (int) leftCount, (int) rightCount, node->split);
    printf("Left\n:");
    for(pos=0; pos < leftCount; pos++) printf("%d ", (int) leftIdx[pos]);
    printf("\nRight\n:");
    for(pos=0; pos < rightCount; pos++) printf("%d ", (int) rightIdx[pos]);
    printf("\n");
#endif
  } else {
    node->split = splitData(
      r->data,
      node->index,
      &leftIdx,
      &rightIdx,
      &leftCount,
      &rightCount,
      m,
      K,
      dim
      );
  }

  // the children now carry the indexes; this node's own array is released
  free(node->index);
  node->index = NULL;

  // build the left subtree first, then the right, on the next dimension
  nextDim = (dim+1) % K;
  node->left  = buildIndex( r, nextDim, leftCount,  leftIdx,  useProb, prob, nodeIdentity );
  node->right = buildIndex( r, nextDim, rightCount, rightIdx, useProb, prob, nodeIdentity );

  return node;
}