コード例 #1
0
/**
 * Parse one data line of an annotated VCF and hand the variant to
 * addGeneFromVcf() for every gene named in its ANNOFULL annotation.
 *
 * Sample INFO content:
 *   ANNO=Nonsynonymous:ASB16;
 *   ANNOFULL=ASB16/NM_080863:+:Nonsynonymous(CCC/Pro/P->ACC/Thr/T:Base1310/1362:Codon437/454:Exon5/5):Exon
 *       |C17orf65/NM_178542:-:Intron
 *
 * When function patterns are configured (func_upper is non-empty) a gene is
 * added only if checkPatternMatch() accepts one of them; otherwise the
 * "INTERGENIC" test below decides.
 *
 * @param buffer  One tab-delimited VCF record; must have at least 8 columns.
 */
void GroupFromAnnotation::addLineFromVcf( String & buffer )
{
	StringArray vfield;
	vfield.AddTokens(buffer, "\t");
	if ( vfield.Length() < 8 )
		error("Annotation vcf only has %d columns!\n", vfield.Length());

	// The INFO column (8th) is a ';'-separated list of entries.
	StringArray info_semicolon;
	info_semicolon.AddTokens( vfield[7],";" );

	// Find the ANNOFULL entry first.
	int annofull_index = -1;
	for( int i=0; i<info_semicolon.Length(); i++ ) {
		String iheader = info_semicolon[i].SubStr(0,8);
		if (iheader == "ANNOFULL") {
			annofull_index = i;
			break;
		}
	}
	if (annofull_index == -1) {
		// Identify the variant by its first two VCF columns (chromosome and
		// position). The INFO entries are not coordinates, and INFO may hold
		// fewer than two entries, so they must not be indexed here.
		printf("warning: no ANNOFULL field at chr%s:%s. Variant won't included in groups!\n", vfield[0].c_str(), vfield[1].c_str());
		return;
	}

	// Drop the leading "ANNOFULL=" (9 characters).
	String anno_full_str = info_semicolon[annofull_index].SubStr(9);

	// Each '|'-separated alternative describes one gene/transcript.
	StringArray alts;
	alts.AddTokens( anno_full_str, "|" );
	for( int a=0; a<alts.Length(); a++ ) {
		StringArray sub;
		sub.AddTokens( alts[a], ":/=");
		if (func_upper.Length() != 0) { // match before add
			for(int f =0;f<func_upper.Length();f++) {
				bool pattern_match = checkPatternMatch( sub, func_upper[f] );
				if ( pattern_match ) {
					addGeneFromVcf( vfield, sub[0] );
					break;
				}
			}
		}
		else { // no pattern to match: check if intergenic first
			String upper_name = sub[0].ToUpper();
			// NOTE(review): this adds the gene only when SlowFind() returns 0.
			// If SlowFind() reports "not found" as -1 (as Find() appears to),
			// only names that START with "INTERGENIC" are added, which looks
			// like the opposite of the intent -- confirm before changing.
			if ( !upper_name.SlowFind( "INTERGENIC" ) )
				addGeneFromVcf( vfield, sub[0] );
		}
	}
}
コード例 #2
0
ファイル: Main.cpp プロジェクト: gpcr/rvtests
/**
 * Fill a matrix from a text file, one matrix row per non-empty line.
 *
 * Every data line must have the same number of tokens as the matrix has
 * columns (the first data line sets the column count when the matrix has
 * none yet). Values are converted with atol(), so any fractional part is
 * discarded.
 *
 * Rows are numbered by data lines, not physical lines, so empty lines in the
 * file do not leave gaps or push the write index past the rows grown so far.
 *
 * @param a         Matrix to fill; zeroed first and grown as needed.
 * @param fileName  Path of the file to read.
 * @return 0 on success, -1 when a line has the wrong number of columns.
 */
int loadMatrix(Matrix& a, String& fileName) {
  a.Zero();

  IFILE ifile(fileName.c_str(), "r");
  String line;
  StringArray array;
  int lineNo = 0;  // physical line number, used only for diagnostics
  int row = 0;     // index of the next matrix row to fill
  while (!ifeof(ifile)) {
    line.ReadLine(ifile);
    lineNo++;
    if (line.Length() == 0) continue;
    array.Clear();
    array.AddTokens(line);
    if (a.cols != 0 && a.cols != array.Length()) {
      fprintf(stderr, "Wrong column size at line %d!\n", lineNo);
      array.Print();
      line.Write(stdout);
      return -1;
    }
    // First data line of an empty matrix: adopt its column count.
    a.GrowTo(a.rows, array.Length());
    if (a.rows <= row) {
      a.GrowTo(row + 1, a.cols);
    }
    for (int i = 0; i < array.Length(); i++) {
      a[row][i] = atol(array[i]);
    }
    row++;
  }

  // a.Print(stdout);
  return 0;
}
コード例 #3
0
ファイル: Main.cpp プロジェクト: gpcr/rvtests
/**
 * Fill a vector from a text file holding one value per line.
 *
 * Empty lines and lines with no tokens are skipped. Elements are numbered by
 * data lines rather than physical lines, so skipped lines neither leave gaps
 * nor push the write index past the allocated size. Values are converted
 * with atol(), so any fractional part is discarded.
 *
 * @param a         Vector to fill; zeroed first and grown as needed.
 * @param fileName  Path of the file to read.
 * @return 0 on success, -1 when a line holds more than one token.
 */
int loadVector(Vector& a, String& fileName) {
  a.Zero();

  IFILE ifile(fileName.c_str(), "r");
  String line;
  StringArray array;
  int lineNo = 0;  // physical line number, used only for diagnostics
  int index = 0;   // index of the next vector element to fill
  while (!ifeof(ifile)) {
    line.ReadLine(ifile);
    lineNo++;
    if (line.Length() == 0) continue;
    array.Clear();
    array.AddTokens(line);
    // A line made only of separators has no value to read.
    if (array.Length() == 0) continue;
    if (array.Length() > 1) {
      fprintf(stderr, "Warning: column size at line %d!\n", lineNo);
      array.Print();
      line.Write(stdout);
      return -1;
    }
    if (a.dim <= index) {
      a.GrowTo(index + 1);
    }
    a[index] = atol(array[0]);
    index++;
  }

  // a.Print(stdout);

  return 0;
}
コード例 #4
0
ファイル: GCContent.cpp プロジェクト: BioInfoTools/qplot
/**
 * Mark the genome positions covered by a region list in regionIndicator.
 *
 * Each usable line of the file has at least three whitespace-separated
 * tokens: chromosome name, start position, end position. Lines that are
 * empty, start with '#', have fewer than three tokens, or whose start is not
 * found inside the reference are skipped.
 *
 * @param regionsFile   Path of the region list; an empty name is a no-op.
 * @param genome        Reference used to translate (chromosome, position)
 *                      into a genome index; must already be loaded.
 * @param invertRegion  When true, the indicator is flipped after loading so
 *                      that it marks everything outside the listed regions.
 */
void GCContent::LoadRegions(String & regionsFile, GenomeSequence &genome, bool invertRegion)
{
    if(regionsFile.Length()==0) return;
    if(genome.sequenceLength()==0) error("No reference genome loaded!\n");

    IFILE fhRegions;
    fhRegions = ifopen(regionsFile.c_str(),"r");
    if(fhRegions==NULL)
        error("Open regions file %s failed!\n", regionsFile.c_str());

    regionIndicator.resize(genome.sequenceLength());

    StringArray tokens;
    String buffer;

    fprintf(stderr, "Loading region list...");

    while (!ifeof(fhRegions)){
        buffer.ReadLine(fhRegions);
        if (buffer.IsEmpty() || buffer[0] == '#') continue;

        // AddTokens appends, so start from an empty array on every line.
        // Clearing only at the end of the loop body would leave tokens of a
        // skipped line in place and corrupt every following line.
        tokens.Clear();
        tokens.AddTokens(buffer, WHITESPACE);
        if(tokens.Length() < 3) continue;

        const int startPos = tokens[1].AsInteger();
        const int endPos = tokens[2].AsInteger();

        // use chromosome name (token[0]) and start position (token[1]) to query genome index.
        genomeIndex_t startGenomeIndex = genome.getGenomePosition(tokens[0].c_str(), startPos);

        if(startGenomeIndex >= regionIndicator.size() ) {
            //fprintf(stderr, "WARNING: region list section %s position %u is not found in the reference and skipped...\n", tokens[0].c_str(), startPos);
            continue;
        }

        // An inverted region (end < start) would wrap around in unsigned
        // arithmetic below, so it is ignored.
        const int len = endPos - startPos + 1;
        if(len <= 0) continue;

        // Never write past the end of the indicator, even when the listed
        // end position lies beyond the reference.
        size_t stop = static_cast<size_t>(startGenomeIndex) + static_cast<size_t>(len);
        if(stop > regionIndicator.size()) stop = regionIndicator.size();

        for(size_t i = startGenomeIndex; i < stop; i++)
            regionIndicator[i] = true;
    }

    if (invertRegion) {
        fprintf(stderr, " invert region...");
        for (uint32_t i = 0; i < regionIndicator.size(); i++) {
            regionIndicator[i] = !regionIndicator[i];
        }
    }

    ifclose(fhRegions);
    fprintf(stderr, "DONE!\n");
}
コード例 #5
0
ファイル: Main.cpp プロジェクト: aminzia/statgen
void StringToArray(const String & input, IntArray & values, int desired)
   {
   StringArray tokens;
   tokens.AddTokens(input, ',');

   values.Dimension(desired);
   values.Zero();

   if (tokens.Length())
      for (int i = 0; i < desired; i++)
         values[i] = tokens[i % tokens.Length()].AsInteger();
   }
コード例 #6
0
// partition 8th column only
void GroupFromAnnotation::GetGroupFromVCF()
{  
   printf("Parsing annotations from annotated VCF file ...\n");
   StringArray func;
   func.AddTokens(function,"/");
   
   vcfInitialize(); // set size of the tables

   FILE * inFile;
   inFile = fopen(vcfInput,"r");
   StringIntHash groupHash;
   int geneCount=0;

// add all genes to group hash first
	while (!feof(inFile))
	{
		String buffer;
		buffer.ReadLine(inFile);
		if ( buffer[0] == '#' )
			continue;
		addLineFromVcf( buffer );
	}
	fclose(inFile);
	
// sort SNPlist and SNPNoAllele
	for( int g=0; g<geneCount; g++ ) {
		if ( SNPlist[g].Length()>1 ) {
			Vector order;
			setOrderFromSortedPositions( g, order);
			
			StringArray cp_SNPlist,cp_SNPNoAllele;
			cp_SNPlist.Dimension(SNPlist[g].Length());
			cp_SNPNoAllele.Dimension(SNPNoAllele[g].Length());
			for(int l=0;l<SNPlist[g].Length();l++) {
				cp_SNPlist[l] = SNPlist[g][l];
				cp_SNPNoAllele[l] = SNPNoAllele[g][l];
			}
			for(int i=0;i<order.Length();i++) {
				SNPlist[g][i] = cp_SNPlist[order[i]];
				SNPNoAllele[g][i] = cp_SNPNoAllele[order[i]] ;
			}
		}
	}

// print test.groupfile
	printf("done!\n");
	String grp_filename = "test.groupfile";
	printGroupFile( grp_filename );
}	
コード例 #7
0
ファイル: Stats.cpp プロジェクト: BioScripts/bamUtil
/**
 * Entry point of the "stats" sub-command: parses the command line, reads a
 * SAM/BAM file and reports the requested statistics.
 *
 * Depending on the options it produces basic file statistics (--basic),
 * raw quality and Phred quality histograms (--qual / --phred) and per-base
 * QC output through the pileup (--pBaseQC / --cBaseQC / --baseSum).
 *
 * @param argc  Number of entries in argv.
 * @param argv  Argument vector; argv[0] is skipped and parsing starts at
 *              argv[1].
 * @return -1 when --in is missing or both --pBaseQC and --cBaseQC are given;
 *         the SamFile status when opening the file or reading the header
 *         fails; otherwise the final SamFile status, with NO_MORE_RECS
 *         reported as SUCCESS.
 */
int Stats::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String indexFile = "";
    bool basic = false;
    bool noeof = false;
    bool params = false;
    bool qual = false;
    bool phred = false;
    int maxNumReads = -1;
    bool unmapped = false;
    String pBaseQC = "";
    String cBaseQC = "";
    String regionList = "";
    int excludeFlags = 0;
    int requiredFlags = 0;
    bool withinRegion = false;
    int minMapQual = 0;
    String dbsnp = "";
    PosList *dbsnpListPtr = NULL;
    bool baseSum = false;
    int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER_GROUP("Types of Statistics")
        LONG_PARAMETER("basic", &basic)
        LONG_PARAMETER("qual", &qual)
        LONG_PARAMETER("phred", &phred)
        LONG_STRINGPARAMETER("pBaseQC", &pBaseQC)
        LONG_STRINGPARAMETER("cBaseQC", &cBaseQC)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_INTPARAMETER("maxNumReads", &maxNumReads)
        LONG_PARAMETER("unmapped", &unmapped)
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_STRINGPARAMETER("regionList", &regionList)
        LONG_INTPARAMETER("excludeFlags", &excludeFlags)
        LONG_INTPARAMETER("requiredFlags", &requiredFlags)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters")
        LONG_PARAMETER("withinRegion", &withinRegion)
        LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters")
        LONG_PARAMETER("baseSum", &baseSum)
        LONG_INTPARAMETER("bufferSize", &bufferSize)
        LONG_INTPARAMETER("minMapQual", &minMapQual)
        LONG_STRINGPARAMETER("dbsnp", &dbsnp)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument for stats, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Use the index file if unmapped or regionList is not empty.
    bool useIndex = (unmapped|| (!regionList.IsEmpty()));

    // IndexFile is required, so check to see if it has been set.
    if(useIndex && (indexFile == ""))
    {
        // Index file was not specified, so default it to the in file
        // + ".bai"
        indexFile = inFile + ".bai";
    }
    ////////////////////////////////////////
    // Setup in case pileup is used.
    Pileup<PileupElementBaseQCStats> pileup(bufferSize);
    // Initialize start/end positions.
    myStartPos = 0;
    myEndPos = -1;
    
    // Open the output qc file if applicable.
    IFILE baseQCPtr = NULL;
    if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty())
    {
        usage();
        inputParameters.Status();
        // Cannot specify both types of baseQC.
        std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl;
        return(-1);
    }
    else if(!pBaseQC.IsEmpty())
    {
        baseQCPtr = ifopen(pBaseQC, "w");
        PileupElementBaseQCStats::setPercentStats(true);
    }
    else if(!cBaseQC.IsEmpty())
    {
        baseQCPtr = ifopen(cBaseQC, "w");
        PileupElementBaseQCStats::setPercentStats(false);
    }

    if(baseQCPtr != NULL)
    {
        PileupElementBaseQCStats::setOutputFile(baseQCPtr);
        PileupElementBaseQCStats::printHeader();
    }
    if((baseQCPtr != NULL) || baseSum)
    {
        PileupElementBaseQCStats::setMapQualFilter(minMapQual);
        PileupElementBaseQCStats::setBaseSum(baseSum);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the file for reading.
    SamFile samIn;
    if(!samIn.OpenForRead(inFile))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    samIn.SetReadFlags(requiredFlags, excludeFlags);

    // Set whether or not basic statistics should be generated.
    samIn.GenerateStatistics(basic);

    // Read the sam header.
    SamFileHeader samHeader;
    if(!samIn.ReadHeader(samHeader))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // Open the bam index file for reading if we are
    // doing unmapped reads (also set the read section).
    if(useIndex)
    {
        samIn.ReadBamIndex(indexFile);

        if(unmapped)
        {
            samIn.SetReadSection(-1);
        }

        if(!regionList.IsEmpty())
        {
            myRegionList = ifopen(regionList, "r");
        }
    }

    //////////////////////////
    // Read dbsnp if specified and doing baseQC
    if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty()))
    {
        // Read the dbsnp file.
        IFILE fdbSnp;
        fdbSnp = ifopen(dbsnp,"r");
        // Determine how many entries.
        const SamReferenceInfo& refInfo = samHeader.getReferenceInfo();
        int maxRefLen = 0;
        for(int i = 0; i < refInfo.getNumEntries(); i++)
        {
            int refLen = refInfo.getReferenceLength(i);
            if(refLen >= maxRefLen)
            {
                maxRefLen = refLen + 1;
            }
        }
        
        // NOTE(review): this allocation is never released in this function;
        // confirm whether the pileup still needs it during its own teardown
        // before adding a delete.
        dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen);

        if(fdbSnp==NULL)
        {
            std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n";
        }
        else if(dbsnpListPtr == NULL)
        {
            std::cerr << "Failed to init the memory allocation for the dbsnpList.\n";
        }
        else
        {
            // Read the dbsnp file.
            StringArray tokens;
            String buffer;
            int position = 0;
            int refID = 0;

            // Loop til the end of the file.
            while (!ifeof(fdbSnp))
            {
                // Read the next line.
                buffer.ReadLine(fdbSnp);
                // If it does not have at least 2 columns, 
                // continue to the next line.
                if (buffer.IsEmpty() || buffer[0] == '#') continue;
                tokens.AddTokens(buffer);
                if(tokens.Length() < 2) continue;

                if(!tokens[1].AsInteger(position))
                {
                    std::cerr << "Improperly formatted region line, start position "
                              << "(2nd column) is not an integer: "
                              << tokens[1]
                              << "; Skipping to the next line.\n";         
                    continue;
                }

                // Look up the reference name.
                refID = samHeader.getReferenceID(tokens[0]);
                if(refID != SamReferenceInfo::NO_REF_ID)
                {
                    // Reference id was found, so add it to the dbsnp
                    dbsnpListPtr->addPosition(refID, position);
                }
        
                tokens.Clear();
                buffer.Clear();
            }
        }
        ifclose(fdbSnp);
    }

    // Read the sam records.
    SamRecord samRecord;

    int numReads = 0;

    //////////////////////
    // Setup in case doing a quality count.
    // Quality histogram.
    const int MAX_QUAL = 126;
    const int START_QUAL = 33;
    uint64_t qualCount[MAX_QUAL+1];
    for(int i = 0; i <= MAX_QUAL; i++)
    {
        qualCount[i] = 0;
    }
    
    const int START_PHRED = 0;
    const int PHRED_DIFF = START_QUAL - START_PHRED;
    const int MAX_PHRED = MAX_QUAL - PHRED_DIFF;
    uint64_t phredCount[MAX_PHRED+1];
    for(int i = 0; i <= MAX_PHRED; i++)
    {
        phredCount[i] = 0;
    }
    
    int refPos = 0;
    Cigar* cigarPtr = NULL;
    char cigarChar = '?';
    // Exclude clips from the qual/phred counts if unmapped reads are excluded.
    bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED;

    //////////////////////////////////
    // When not reading by sections, getNextSection returns true
    // the first time, then false the next time.
    while(getNextSection(samIn))
    {
        // Keep reading records from the file until SamFile::ReadRecord
        // indicates to stop (returns false).
        while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord))
        {
            // Another record was read, so increment the number of reads.
            ++numReads;
            // See if the quality histogram should be generated.
            if(qual || phred)
            {
                // Get the quality string. It deliberately has its own name:
                // calling it "qual" would hide the --qual flag for the rest
                // of this scope and make the flag test below always true.
                const char* qualString = samRecord.getQuality();
                // Check for no quality ('*').
                if((qualString[0] == '*') && (qualString[1] == 0))
                {
                    // This record does not have a quality string, so no 
                    // quality processing is necessary.
                }
                else
                {
                    int index = 0;
                    cigarPtr = samRecord.getCigarInfo();
                    cigarChar = '?';
                    refPos = samRecord.get0BasedPosition();
                    if(!qualExcludeClips && (cigarPtr != NULL))
                    {
                        // Offset the reference position by any soft clips
                        // by subtracting the queryIndex of this start position.
                        // refPos is now the start position of the clips.
                        refPos -= cigarPtr->getQueryIndex(0);
                    }

                    while(qualString[index] != 0)
                    {
                        // Skip this quality if it is clipped and we are skipping clips.
                        if(cigarPtr != NULL)
                        {
                            cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index);
                        }
                        if(qualExcludeClips && Cigar::isClip(cigarChar))
                        {
                            // Skip a clipped quality.
                            ++index;
                            // Increment the position.
                            continue;
                        }

                        if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos))
                        {
                            // We have hit the end of the region, stop processing this
                            // quality string.
                            break;
                        }

                        if(withinRegion && (refPos < myStartPos))
                        {
                            // This position is not in the target.
                            ++index;
                            // Update the position if this is found in the reference or a clip.
                            if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                            {
                                ++refPos;
                            }
                            continue;
                        }

                        // Check for valid quality.
                        if((qualString[index] < START_QUAL) || (qualString[index] > MAX_QUAL))
                        {
                            // Report only for the histogram(s) requested.
                            if(qual)
                            {
                                std::cerr << "Invalid Quality found: " << qualString[index] 
                                          << ".  Must be between "
                                          << START_QUAL << " and " << MAX_QUAL << ".\n";
                            }
                            if(phred)
                            {
                                // The value printed is on the Phred scale, so
                                // the limits are the Phred limits as well.
                                std::cerr << "Invalid Phred Quality found: " << qualString[index] - PHRED_DIFF
                                          << ".  Must be between "
                                          << START_PHRED << " and " << MAX_PHRED << ".\n";
                            }
                            // Skip an invalid quality.
                            ++index;
                            // Update the position if this is found in the reference or a clip.
                            if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                            {
                                ++refPos;
                            }
                            continue;
                        }
                        
                        // Increment the count for this quality.
                        ++(qualCount[(int)(qualString[index])]);
                        ++(phredCount[(int)(qualString[index]) - PHRED_DIFF]);
                        // Update the position if this is found in the reference or a clip.
                        if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                        {
                            ++refPos;
                        }
                        ++index;
                    }
                }
            }

            // Check the next thing to do for the read.
            if((baseQCPtr != NULL) || baseSum)
            {
                // Pileup the bases for this read.
                pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr);
            }
        }

        // Done with a section, move on to the next one.

        // New section, so flush the pileup.
        pileup.flushPileup();
    }

    // Flush the rest of the pileup.
    if((baseQCPtr != NULL) || baseSum)
    {
        // Pileup the bases.
        pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr);
        PileupElementBaseQCStats::printSummary();
        ifclose(baseQCPtr);
    }

    std::cerr << "Number of records read = " << 
        samIn.GetCurrentRecordCount() << std::endl;

    if(basic)
    {
        std::cerr << std::endl;
        samIn.PrintStatistics();
    }

    // Print the quality stats.
    if(qual)
    {
        std::cerr << std::endl;
        std::cerr << "Quality\tCount\n";
        for(int i = START_QUAL; i <= MAX_QUAL; i++)
        {
            std::cerr << i << "\t" << qualCount[i] << std::endl;
        }
    }
    // Print the phred quality stats.
    if(phred)
    {
        std::cerr << std::endl;
        std::cerr << "Phred\tCount\n";
        for(int i = START_PHRED; i <= MAX_PHRED; i++)
        {
            std::cerr << i << "\t" << phredCount[i] << std::endl;
        }
    }

    SamStatus::Status status = samIn.GetStatus();
    if(status == SamStatus::NO_MORE_RECS)
    {
        // A status of NO_MORE_RECS means that all reads were successful.
        status = SamStatus::SUCCESS;
    }

    return(status);
}
コード例 #8
0
bool RegressionAnalysis::ReadModelsFromFile()
   {
   StringArray models;
   models.Read(modelsFile);

   if (models.Length() == 0)
      return false;

   regress = new FancyRegression[models.Length()];

   printf("Retrieving analysis models from file [%s]...\n",
          (const char *) modelsFile);

   modelCount = 0;

   StringArray tokens;
   for (int i = 0, line = 0; i < models.Length(); i++)
      {
      models[i].Trim();

      // Skip comments
      if (models[i][0] == '#') continue;

      // Divide each line into tokens
      tokens.Clear();
      tokens.AddTokens(models[i]);

      // Skip blank lines
      if (tokens.Length() == 0) continue;

      // Print message for tracing...
      printf("   Input: %s\n", (const char *) models[i], line++);

      // Need a minimum of four tokens per line
      if (tokens.Length() < 4)
         {
         printf(" Skipped: Trait name, mean, variance and heritability required.\n");
         continue;
         }

      regress[modelCount].trait = ped.LookupTrait(tokens[0]);

      if (regress[modelCount].trait < 0)
         {
         printf(line == 1 ? " Skipped: Appears to be a header line\n" :
                            " Skipped: Trait %s not listed in the data file\n",
                            (const char *) tokens[0]);
         continue;
         }

      // First check that mean, variance and heritability are valid numbers
      bool fail = false;
      for (int j = 1; j <= 3; j++)
         {
         char * ptr = NULL;
         strtod(tokens[j], &ptr);
         fail |= ptr[0] != 0;
         }

      // If one of the values is not a valid number, skip
      if (fail)
         {
         printf(line == 1 ? " Skipped: Appears to be a header line\n" :
                            " Skipped: Invalid numeric format\n");
         continue;
         }

      regress[modelCount].mean = tokens[1];
      regress[modelCount].variance = tokens[2];
      regress[modelCount].heritability = tokens[3];

      if (tokens.Length() > 4)
         {
         regress[modelCount].label = tokens[4];

         for (int j = 5; j < tokens.Length(); j++)
            {
            regress[modelCount].label += " ";
            regress[modelCount].label += tokens[j];
            }
         }
      else
         regress[modelCount].label.printf("Model %d", modelCount + 1);

      regress[modelCount].shortLabel = regress[modelCount].label;
      regress[modelCount].testRetestCorrel = testRetestCorrel;
      regress[modelCount].bounded = !unrestricted;

      printf("        Model loaded and labelled %s\n", (const char *) regress[modelCount].label);

      modelCount++;
      }

   if (modelCount == 0)
      {
      printf("No valid models, default model will be used\n\n");
      return false;
      }

   printf("Table processed. %d models recognized\n\n", modelCount);

   return true;
   }
コード例 #9
0
void GroupFromAnnotation::vcfInitialize()
{
	// func_upper
	if ( function != "" ) {
		func_upper.AddTokens( function, "/" );
		for( int i=0; i<func_upper.Length(); i++ )
			func_upper[i] = func_upper[i].ToUpper();
	}

	FILE * inFile;
	inFile = fopen(vcfInput,"r");
	while (!feof(inFile)) {
		String buffer;
		buffer.ReadLine( inFile);
		if ( buffer[0] == '#' )
			continue;
		StringArray vfield;
		vfield.AddTokens(buffer, "\t");
		if ( vfield.Length() < 8 )
			error("Annotation vcf only has %d columns!\n", vfield.Length());
		StringArray info_semicolon;
		info_semicolon.AddTokens( vfield[7],";" );
		
		int annofull_index = -1;
		for( int i=0; i<info_semicolon.Length(); i++ ) {
			String iheader = info_semicolon[i].SubStr(0,8);
			if (iheader == "ANNOFULL") {
				annofull_index = i;
				break;
			}
		}
		if (annofull_index == -1)
			continue;
		String anno_full_str = info_semicolon[annofull_index].SubStr(9);
		StringArray alts;
		alts.AddTokens( anno_full_str, "|" );
		for( int a=0; a<alts.Length(); a++ ) {
			StringArray sub;
			sub.AddTokens( alts[a], ":/=");
			if (func_upper.Length() != 0) { // match before add
				for(int f =0;f<func_upper.Length();f++) {
					bool pattern_match = checkPatternMatch( sub, func_upper[f] );
					if ( pattern_match ) {
						chrom.Push( vfield[0] );
						addGeneToGroupHash( sub[0] );
						break;
					}
				}
			}
			else { // no pattern to match
				chrom.Push( vfield[0] );
				addGeneToGroupHash( sub[0] );		
			}
		}
	}

// vectors	
	SNPlist = new StringArray [geneCount];
	SNPNoAllele = new StringArray [geneCount];
	pos = new Vector [geneCount];
	
}
コード例 #10
0
/**
 * Load the gene map (refFlat-style, tab-delimited) and record, per gene, its
 * chromosome and the widest start/end span over all of its records.
 *
 * Columns used (0-based): 0 gene name, 2 chromosome, 4 start, 5 end.
 * Records on chromosomes whose name contains "Un" are ignored, as are
 * further records of a known gene that sit on a different chromosome.
 * Afterwards chr_idx indexes the genes by chromosome and ChrStartHash /
 * ChrEndHash receive the boundaries between consecutive chromosomes.
 *
 * When the default map "../data/refFlat_hg19.txt" cannot be opened, the
 * ".gz" variant and a location derived from `path` are tried as well.
 *
 * @param path  Searched for "bin"; the text before it is used as the base
 *              directory of the fallback data file.
 */
void GroupFromAnnotation::GetGeneMap(String path)
{
   IFILE genemap;
   genemap =  ifopen(mapFile,"r");
   if(genemap==NULL)
   {
      if(mapFile=="../data/refFlat_hg19.txt")
      {
	 mapFile += ".gz";

	 genemap = ifopen(mapFile,"r");
	 if(genemap==NULL)
	 {
	    int loc = path.Find("bin");
	    if(loc!=-1)
	    {
	       mapFile = path.Left(loc-1);
	       mapFile += "/data/refFlat_hg19.txt";
	    }
	    else
	    {
	       // mapFile already carries the ".gz" suffix at this point, so
	       // assign the default instead of appending to it; appending
	       // would build a path that can never exist and a confusing
	       // error message.
	       mapFile = "../data/refFlat_hg19.txt";
	    }
	    genemap = ifopen(mapFile,"r");
	 }
	 if(genemap==NULL)
	 {
	    mapFile += ".gz";
	    genemap = ifopen(mapFile,"r");
	 }
	 if(genemap==NULL)
	    error("Cannot open gene mapping file %s.\n",mapFile.c_str());

      }
      else
	 error("Cannot open gene mapping file %s.\n",mapFile.c_str());
   }
   StringIntHash GeneLocHash;
   int gene_idx =0;

   while(!ifeof(genemap))
   {
      String buffer;
      buffer.ReadLine(genemap);
      StringArray record;
      record.AddTokens(buffer,"\t");
      // Columns 0..5 are read below; this also skips the empty line that is
      // read just before end-of-file is reported.
      if(record.Length()<6)
	 continue;

      // Strip a leading "chr"/"CHR" style prefix (detected by its 3rd letter).
      if(record[2].Length()>2 && (record[2][2]=='r' || record[2][2]=='R'))
	 record[2] = record[2].SubStr(3);
      StringArray gene_chr;
      gene_chr.AddTokens(record[2],"_,;.");
      if(gene_chr.Length()==0 || gene_chr[0].Find("Un")!=-1)
	 continue;

      int loc = GeneLocHash.Integer(record[0]);
      if(loc==-1)
      {
	 // Register the gene only after it passed the chromosome filter, so
	 // that every hash entry points at a slot that really exists in the
	 // parallel arrays.
	 GeneLocHash.SetInteger(record[0],gene_idx);
	 //save chr, start and end positions
	 chr.Push(gene_chr[0]);
	 start_pos.Push(record[4].AsInteger());
	 end_pos.Push(record[5].AsInteger());
	 genename.Push(record[0]);
	 gene_idx++;
      }
      else
      {
	 //a known gene must stay on the chromosome of its first record
	 if(chr[loc]!=gene_chr[0]) 
	    continue;
	 //widen the span to cover this record
	 if(record[4].AsInteger()<start_pos[loc])
	    start_pos[loc] = record[4].AsInteger();
	 if(record[5].AsInteger()>end_pos[loc])
	    end_pos[loc] = record[5].AsInteger();
      }
   }
   ifclose(genemap);

   chr_idx.Index(chr);
   // Nothing was loaded: there is no first chromosome to start from.
   if(chr.Length()==0)
      return;

   // NOTE(review): the start of the first chromosome and the end of the last
   // one are never stored by this loop -- confirm that the readers of
   // ChrStartHash / ChrEndHash expect that.
   String chr_=chr[chr_idx[0]];
   for(int i=1;i<chr.Length();i++)
   {
      if(chr[chr_idx[i]]!=chr_)
      {
	 ChrStartHash.SetInteger(chr[chr_idx[i]],i);
	 ChrEndHash.SetInteger(chr_,i-1);
	 chr_ = chr[chr_idx[i]];
      }
   }
}
コード例 #11
0
void GroupFromAnnotation::GetGroupFromFile(FILE * log)
{
   //Fill in annoGroups.
   StringArray tmp;
   FILE * file = fopen(groupFile,"r");
   if(file==NULL)
   {
      printf("ERROR! Cannot open group file %s.\n",groupFile.c_str());
      error("ERROR! Cannot open group file %s.\n",groupFile.c_str());
   }
   String buffer;
   int line = 0;
   while (!feof(file))
   {
      buffer.ReadLine(file);
      tmp.Clear();
      tmp.AddTokens(buffer, SEPARATORS);
      if(tmp.Length()==0)
	 continue;
      annoGroups.Push(tmp[0]);
      chrom.Push(tmp[1]);
      line++;
   }
   fclose(file);

   //Fill in SNPlist.
   SNPlist = new StringArray [line];
   SNPNoAllele = new StringArray [line];
   FILE * samefile = fopen(groupFile,"r");
   line = 0;
   Vector pos;
   while (!feof(samefile))
   {
      buffer.ReadLine(samefile);
      tmp.Clear();
      pos.Clear();
      tmp.AddTokens(buffer, "\t ");
      SNPlist[line].Dimension(0);
      SNPNoAllele[line].Dimension(0);
      for(int i=1;i<tmp.Length();i++)
      {
	 SNPlist[line].Push(tmp[i]);
	 StringArray sub;
	 sub.Clear();
	 sub.AddTokens(tmp[i],":_/");
	 if(sub.Length()!=4)
	 {
	    printf("Warning: group %s has a variant %s that has invalid format. The correct format should be chr:pos:allele1:allele2.\n",tmp[0].c_str(),tmp[i].c_str());
	    fprintf(log,"Warning: group %s has a variant %s that has invalid format. The correct format should be chr:pos:allele1:allele2.\n",tmp[0].c_str(),tmp[i].c_str());
	    continue;
	 }
	 pos.Push(sub[1].AsInteger());
	 SNPNoAllele[line].Push(sub[0] + ":" + sub[1]);
      }
      //sort SNPlist[line] and SNPNoAllele[line]
      if(SNPlist[line].Length()>1)
      {
	 Vector sorted_pos,order;
	 sorted_pos.Copy(pos);
	 sorted_pos.Sort();
	 order.Dimension(pos.Length());
	 for(int i=0;i<sorted_pos.Length();i++)
	 {
	    for(int j=0;j<pos.Length();j++)
	    {
	       if(sorted_pos[i]==pos[j])
	       {
		  order[i]=j; 
		  break;
	       }
	    }
	 }

	 StringArray cp_SNPlist,cp_SNPNoAllele;
	 cp_SNPlist.Dimension(SNPlist[line].Length());
	 cp_SNPNoAllele.Dimension(SNPNoAllele[line].Length());
	 for(int l=0;l<SNPlist[line].Length();l++)
	 {
	    cp_SNPlist[l] = SNPlist[line][l];
	    cp_SNPNoAllele[l] = SNPNoAllele[line][l];
	 }
	 for(int i=0;i<order.Length();i++)
	 {
	    SNPlist[line][i] = cp_SNPlist[order[i]];
	    //printf("%s\t",SNPlist[line][i].c_str());
	    SNPNoAllele[line][i] = cp_SNPNoAllele[order[i]] ;
	 }
	 //printf("\n");
      }
      line++;
   }
   fclose(samefile);
}