コード例 #1
0
ファイル: Main.cpp プロジェクト: gpcr/rvtests
/**
 * Load a table of numbers from a text file into a matrix.
 *
 * Every non-empty line is split with StringArray::AddTokens() (default
 * separators) and stored as one matrix row. All data lines must produce the
 * same number of tokens.
 *
 * @param a        Destination matrix; Zero() is called on it before reading
 *                 and it is grown as rows are read.
 * @param fileName Path of the text file to read.
 * @return 0 on success, -1 if a data line does not match the column count
 *         established so far (a diagnostic is written to stderr/stdout).
 */
int loadMatrix(Matrix& a, String& fileName) {
  a.Zero();

  IFILE ifile(fileName.c_str(), "r");
  String line;
  StringArray array;
  int lineNo = 0;  // 1-based line number in the file, used only for diagnostics
  int row = 0;     // index of the matrix row the next data line is stored in
  while (!ifeof(ifile)) {
    line.ReadLine(ifile);
    lineNo++;
    // Empty lines carry no data. They still count for diagnostics but must not
    // consume a matrix row; indexing rows by lineNo would skip past the rows
    // actually allocated as soon as a blank line appears between data lines.
    if (line.Length() == 0) continue;
    array.Clear();
    array.AddTokens(line);
    if (a.cols != 0 && a.cols != array.Length()) {
      fprintf(stderr, "Wrong column size at line %d!\n", lineNo);
      array.Print();
      line.Write(stdout);
      return -1;
    }
    a.GrowTo(a.rows, array.Length());
    if (a.rows < row + 1) {
      a.GrowTo(row + 1, a.cols);
    }
    for (int i = 0; i < array.Length(); i++) {
      // NOTE(review): atol() discards any fractional part of the token -
      // confirm that integer-valued input is what callers expect.
      a[row][i] = atol(array[i]);
    }
    row++;
  }

  // a.Print(stdout);
  return 0;
};
コード例 #2
0
ファイル: VcfFile.cpp プロジェクト: aminzia/statgen
/**
 * Print two parallel string arrays in full by delegating to the ranged
 * printArrayDoubleJoin() overload with the range [0, length).
 *
 * @param oFile Output handle forwarded to the ranged overload.
 * @param arr1  First array.
 * @param arr2  Second array; must have the same length as arr1.
 * @param sep1  Separator argument forwarded unchanged.
 * @param sep2  Separator argument forwarded unchanged.
 * @param empty Placeholder argument forwarded unchanged.
 * @throws VcfFileException if the two arrays differ in length.
 */
void VcfHelper::printArrayDoubleJoin(IFILE oFile, const StringArray& arr1, const StringArray& arr2, const char* sep1, const char* sep2, const char* empty) {
  const int firstCount = arr1.Length();
  const int secondCount = arr2.Length();

  // Arrays of different sizes cannot be paired up; reject them up front.
  if ( firstCount != secondCount ) {
    throw VcfFileException("Inconsistency between arr1.Length() == %d and arr2.Length() == %d", firstCount, secondCount);
  }

  printArrayDoubleJoin(oFile, arr1, arr2, sep1, sep2, empty, 0, firstCount);
}
コード例 #3
0
// Parse one tab-separated annotation VCF record and hand the variant to
// addGeneFromVcf() for the genes listed in its ANNOFULL entry.
//
// buffer: one line of the annotation VCF. At least 8 tab-separated columns are
//         required; column 8 is split on ';' and searched for an entry that
//         starts with "ANNOFULL".
//
// When func_upper is non-empty, a gene is added only if checkPatternMatch()
// accepts one of the patterns for that alternative; otherwise the INTERGENIC
// test below decides.
void GroupFromAnnotation::addLineFromVcf( String & buffer )
{
// sample:
// ANNO=Nonsynonymous:ASB16;
// ANNOFULL=ASB16/NM_080863:+:Nonsynonymous(CCC/Pro/P->ACC/Thr/T:Base1310/1362:Codon437/454:Exon5/5):Exon
//	|C17orf65/NM_178542:-:Intron

	StringArray vfield;
	vfield.AddTokens(buffer, "\t");
	if ( vfield.Length() < 8 ) {
		error("Annotation vcf only has %d columns!\n", vfield.Length());
		return; // never index column 8 of a short record, even if error() returns
	}
	StringArray info_semicolon;
	info_semicolon.AddTokens( vfield[7],";" );
	
// find ANNOFULL first
	int annofull_index = -1;
	for( int i=0; i<info_semicolon.Length(); i++ ) {
		String iheader = info_semicolon[i].SubStr(0,8);
		if (iheader == "ANNOFULL") {
			annofull_index = i;
			break;
		}
	}
	if (annofull_index == -1) {
		// Report the variant by its first two VCF columns (the chr:pos the
		// message asks for). The INFO entries are not a location and the list
		// may hold fewer than two items.
		printf("warning: no ANNOFULL field at chr%s:%s. Variant won't included in groups!\n", vfield[0].c_str(), vfield[1].c_str());
		return;
	}

// remove ANNOFULL=
	String anno_full_str = info_semicolon[annofull_index].SubStr(9);

// check each alternative field
	StringArray alts;
	alts.AddTokens( anno_full_str, "|" );
	for( int a=0; a<alts.Length(); a++ ) {
		StringArray sub;
		sub.AddTokens( alts[a], ":/=");
		// An alternative that yields no tokens has no gene name to add.
		if ( sub.Length() == 0 )
			continue;
		if (func_upper.Length() != 0) { // match before add
			for(int f =0;f<func_upper.Length();f++) {
				bool pattern_match = checkPatternMatch( sub, func_upper[f] );
				if ( pattern_match ) {
					addGeneFromVcf( vfield, sub[0] );
					break;
				}
			}
		}
		else { // no pattern to match: check if intergenic first
			String upper_name = sub[0].ToUpper();
			// NOTE(review): this condition is true only when SlowFind() returns 0.
			// If SlowFind() returns the match position (and a negative value when
			// absent), names that do not contain INTERGENIC are skipped here -
			// confirm the intended polarity before relying on this branch.
			if ( !upper_name.SlowFind( "INTERGENIC" ) )
				addGeneFromVcf( vfield, sub[0] );
		}
	}	
}
コード例 #4
0
ファイル: Main.cpp プロジェクト: aminzia/statgen
void StringToArray(const String & input, IntArray & values, int desired)
   {
   StringArray tokens;
   tokens.AddTokens(input, ',');

   values.Dimension(desired);
   values.Zero();

   if (tokens.Length())
      for (int i = 0; i < desired; i++)
         values[i] = tokens[i % tokens.Length()].AsInteger();
   }
コード例 #5
0
// Read a region list and populate the per-contig region tables.
//
// Each data line needs at least three tokens: a contig name, a start and an
// end. An optional fourth token names a group whose segment count and total
// length are accumulated in groupStats. Empty lines and lines starting with
// '#' are ignored. Afterwards `contigs` and `groups` are rebuilt from the keys
// of genomeRegions and groupStats.
//
// error() is invoked when the file cannot be opened, a line has too few
// columns, a region's start is not smaller than its end, or the starts within
// one contig are not in non-decreasing order.
void GenomeRegionSeqStats::LoadRegionList(String &inputList)
{
  FILE *in = fopen(inputList.c_str(), "r");
  if(in==NULL) error("Open region input file %s failed!\n", inputList.c_str());

  String buffer;
  StringArray tokens;
  while(!feof(in))
    {
      buffer.ReadLine(in);
      if (buffer.IsEmpty() || buffer[0] == '#') continue;

      tokens.ReplaceTokens(buffer);
      if(tokens.Length()<3)
	error("Too few columns: %s\n", buffer.c_str());

      // Key used to attach group names to this exact region.
      String CSE = tokens[0]+":"+tokens[1]+":"+tokens[2];

      const std::pair<int, int> start_end(tokens[1].AsInteger(), tokens[2].AsInteger());
      // positions are 0-based. Otherwise == is valid
      if(start_end.first>=start_end.second)
      	error("Region end is equal or smaller than the start: %s!\n", buffer.c_str());      

      const String &contig = tokens[0];
      genomeRegions_lines[contig].push_back(buffer);
      genomeRegions[contig].push_back(start_end);
      genomeRegions_currentIndex[contig] = 0; 

      if(tokens.Length()>3) {
	Stats &groupEntry = groupStats[tokens[3]];
	groupEntry.segCount++;
	groupEntry.totalLen += (start_end.second - start_end.first);
	genomeRegionGroups[CSE].push_back(tokens[3]);
      }
    }
  
  fclose(in);
  
  // Chromosome info: collect contig names and verify the start ordering
  contigs.clear();
  std::map<String, vector<std::pair<int, int> > >::iterator region;
  for(region=genomeRegions.begin(); region!=genomeRegions.end(); region++)
    {
      contigs.push_back(region->first);
      vector<std::pair<int, int> > &intervals = region->second;
      for(unsigned int i=1; i<intervals.size(); i++)
	if(intervals[i].first<intervals[i-1].first)
	  error("Input coordinates are not in order: %s %d %d!\n", region->first.c_str(),intervals[i].first,intervals[i].second);
    }

  // Group info such as gene names
  groups.clear();
  std::map<String, Stats>::iterator group;
  for(group=groupStats.begin(); group!=groupStats.end(); group++)
    groups.push_back(group->first);
}
コード例 #6
0
ファイル: Main.cpp プロジェクト: gpcr/rvtests
/**
 * Load a single column of numbers from a text file into a vector.
 *
 * Every non-empty line must tokenize (StringArray::AddTokens(), default
 * separators) to exactly one value, which is stored in the next element.
 *
 * @param a        Destination vector; Zero() is called on it before reading
 *                 and it is grown as values are read.
 * @param fileName Path of the text file to read.
 * @return 0 on success, -1 if a line holds more than one token (a diagnostic
 *         is written to stderr/stdout).
 */
int loadVector(Vector& a, String& fileName) {
  a.Zero();

  IFILE ifile(fileName.c_str(), "r");
  String line;
  StringArray array;
  int lineNo = 0;  // 1-based line number in the file, used only for diagnostics
  int row = 0;     // index of the element the next value is stored in
  while (!ifeof(ifile)) {
    line.ReadLine(ifile);
    lineNo++;
    // Empty lines must not consume an element; indexing by lineNo would run
    // past the allocated size once a blank line appears between values.
    if (line.Length() == 0) continue;
    array.Clear();
    array.AddTokens(line);
    // A line made only of separators yields no token; there is no array[0].
    if (array.Length() == 0) continue;
    if (array.Length() > 1) {
      fprintf(stderr, "Warning: column size at line %d!\n", lineNo);
      array.Print();
      line.Write(stdout);
      return -1;
    }
    if (a.dim < row + 1) {
      a.GrowTo(row + 1);
    }
    // NOTE(review): atol() discards any fractional part of the token -
    // confirm that integer-valued input is what callers expect.
    a[row] = atol(array[0]);
    row++;
  }

  // a.Print(stdout);

  return 0;
};
コード例 #7
0
ファイル: GCContent.cpp プロジェクト: BioInfoTools/qplot
/**
 * Flag the genome positions covered by a region list in regionIndicator.
 *
 * Each usable line has at least three whitespace-separated tokens: a sequence
 * name, a start and an end. The (name, start) pair is translated with
 * genome.getGenomePosition() and end - start + 1 consecutive flags are set
 * from there. Empty lines, lines starting with '#', lines with fewer than
 * three tokens and regions whose start does not map inside regionIndicator are
 * skipped silently.
 *
 * @param regionsFile  Path of the region list; an empty name makes this a no-op.
 * @param genome       Loaded reference; error() is called if its length is 0.
 * @param invertRegion When true, every flag is negated after loading.
 */
void GCContent::LoadRegions(String & regionsFile, GenomeSequence &genome, bool invertRegion)
{
    if(regionsFile.Length()==0) return;
    if(genome.sequenceLength()==0) error("No reference genome loaded!\n");

    IFILE fhRegions;
    fhRegions = ifopen(regionsFile.c_str(),"r");
    if(fhRegions==NULL)
        error("Open regions file %s failed!\n", regionsFile.c_str());

    regionIndicator.resize(genome.sequenceLength());

    StringArray tokens;
    String buffer;
    int len;

    fprintf(stderr, "Loading region list...");

    while (!ifeof(fhRegions)){
        buffer.Clear();
        buffer.ReadLine(fhRegions);
        if (buffer.IsEmpty() || buffer[0] == '#') continue;

        // Start from an empty token list on every line. AddTokens() appends,
        // so clearing only at the bottom of the loop left stale tokens behind
        // whenever a line was skipped, and later lines were then parsed from
        // the old first three tokens.
        tokens.Clear();
        tokens.AddTokens(buffer, WHITESPACE);
        if(tokens.Length() < 3) continue;

        const int regionStart = tokens[1].AsInteger();
        const int regionEnd = tokens[2].AsInteger();

        // use chromosome name (token[0]) and position (token[1]) to query genome index.
        genomeIndex_t startGenomeIndex = genome.getGenomePosition(tokens[0].c_str(), regionStart);

        if(startGenomeIndex >= regionIndicator.size() ) {
            //fprintf(stderr, "WARNING: region list section %s position %u is not found in the reference and skipped...\n", tokens[0].c_str(), regionStart);
            continue;
        }

        len = regionEnd - regionStart + 1;
        if(len <= 0) continue; // end precedes start: nothing to mark

        // Clip the region to the indicator so a region running past the end of
        // the reference cannot write out of bounds.
        size_t span = static_cast<size_t>(len);
        const size_t available = regionIndicator.size() - startGenomeIndex;
        if(span > available) span = available;

        for(size_t offset = 0; offset < span; offset++)
            regionIndicator[startGenomeIndex + offset] = true;
    }

    if (invertRegion) {
        fprintf(stderr, " invert region...");
        for (uint32_t i = 0; i < regionIndicator.size(); i++) {
            regionIndicator[i] = !regionIndicator[i];
        }
    }

    ifclose(fhRegions);
    fprintf(stderr, "DONE!\n");
}
コード例 #8
0
ファイル: VcfFile.cpp プロジェクト: aminzia/statgen
/**
 * Print a whole string array to oFile.
 *
 * An empty array prints the `empty` placeholder, a single element is printed
 * on its own, and anything longer is delegated to the ranged printArrayJoin()
 * overload over [0, length).
 *
 * @param oFile Output handle.
 * @param arr   Strings to print.
 * @param sep   Separator forwarded to the ranged overload.
 * @param empty Text printed when arr has no elements.
 */
void VcfHelper::printArrayJoin(IFILE oFile, const StringArray& arr, const char* sep, const char* empty) {
  const int count = arr.Length();

  switch ( count ) {
  case 0:
    ifprintf(oFile,"%s",empty);
    break;
  case 1:
    ifprintf(oFile,"%s",arr[0].c_str());
    break;
  default:
    printArrayJoin(oFile,arr,sep,empty,0,count);
    break;
  }
}
コード例 #9
0
// Load per-interval crossover (recombination) rates from a text file into R.
//
// The file is accepted only when it holds exactly `markers` lines. Line 0 is
// not read; line i + 1 supplies the rate for R[i], taken from its second
// token. Lines with fewer than two tokens leave R[i] untouched.
//
// Returns true when the file was accepted, false otherwise.
bool MarkovParameters::ReadCrossoverRates(const char * filename)
   {
   StringArray tokens;
   StringArray rec;
   rec.Read(filename);

   // A file with the wrong number of lines is ignored
   if (rec.Length() != markers)
      return false;

   printf("  Updating recombination rates using data in %s ...\n", (const char *) filename);

   // Line i + 1 is read for entry i, so with `markers` lines in the file the
   // last entry that can be filled is markers - 2. Iterating up to
   // markers - 1 would read rec[markers], one past the end.
   for (int i = 0; i < markers - 1; i++)
      {
      tokens.ReplaceTokens(rec[i+1]);

      if (tokens.Length() >= 2) R[i] = tokens[1].AsDouble();
      }

   return true;
   }
コード例 #10
0
// Report whether any element of `sub`, upper-cased, gives SlowFind( func ) == 0.
// Elements shorter than `func` are not examined. The elements of `sub` are
// copied before being upper-cased, so the array itself is left unchanged.
bool GroupFromAnnotation::checkPatternMatch( StringArray & sub, String & func )
{
	for( int idx=0; idx<sub.Length(); idx++ ) {
		// too short to hold the pattern
		if ( sub[idx].Length() < func.Length() )
			continue;

		String candidate = sub[idx];
		String upper_candidate = candidate.ToUpper();
		if ( upper_candidate.SlowFind( func ) == 0 )
			return true;
	}
	return false;
}
コード例 #11
0
ファイル: SamHeaderRecord.cpp プロジェクト: luyi0629/Minimac3
// Populate this record's tags from an already tokenized header line.
// Column 0 of tokens holds the record type, so tags are read from column 1 on.
// Each tag token must be at least three characters long with ':' as its third
// character; the first two characters are the tag and everything after the
// ':' is its value.
// Returns true only if every token was well formed, every setTag() call
// returned true and isValid() returns true afterwards.
bool SamHeaderRecord::setFields(const StringArray& tokens)
{
    bool allGood = true;

    for(int idx = 1; idx < tokens.Length(); ++idx)
    {
        // A usable tag token looks like "XY:value".
        const bool wellFormed = (tokens[idx].Length() >= 3) &&
                                (tokens[idx][2] == ':');
        if(!wellFormed)
        {
            // Report it and move on to the remaining tags; the overall
            // result is a failure from here on.
            std::cerr << "ERROR: Poorly formatted tag in header: "
                      << tokens[idx] << std::endl;
            allGood = false;
            continue;
        }

        // Two-character tag name, nul terminated.
        char tag[3] = { tokens[idx][0], tokens[idx][1], 0 };

        // Everything after the ':' is the value.
        String tagValue = (tokens[idx]).SubStr(3);

        // '&=' (not '&&') so that setTag() runs even after an earlier failure.
        allGood &= setTag(tag, tagValue.c_str());
    }

    allGood &= isValid();

    return(allGood);
}
コード例 #12
0
ファイル: VcfMac.cpp プロジェクト: statgen/vcfUtil
int VcfMac::execute(int argc, char **argv)
{
    String inputVcf = "";
    int minAC = -1;
    String sampleSubset = "";
    String filterList = "";
    bool params = false;

    IntervalTree<int> regions;
    std::vector<int> intersection;
    
    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inputVcf)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_STRINGPARAMETER("sampleSubset", &sampleSubset)
        LONG_INTPARAMETER("minAC", &minAC)
        LONG_STRINGPARAMETER("filterList", &filterList)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    inputParameters.Read(argc-1, &(argv[1]));
    
    // Check that all files were specified.
    if(inputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in\", a required parameter.\n\n";
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the two input files.
    VcfFileReader inFile;
    VcfHeader header;
    VcfRecord record;

    // Open the file
    if(sampleSubset.IsEmpty())
    {
        inFile.open(inputVcf, header);        
    }
    else
    {
        inFile.open(inputVcf, header, sampleSubset, NULL, NULL);
    }
    
    // Add the discard rule for minor allele count.
    if(minAC >= 0)
    {
        inFile.addDiscardMinMinorAlleleCount(minAC, NULL);
    }
    
    if(!filterList.IsEmpty())
    {
        // Open the filter list.
        IFILE regionFile = ifopen(filterList, "r");
        String regionLine;
        StringArray regionColumn;
        int start;
        int end;
        int intervalVal = 1;
        if(regionFile == NULL)
        {
            std::cerr << "Failed to open " << filterList 
                      << ", so keeping all positions\n";
            filterList.Clear();
        }
        else
        {
            while( regionFile->isOpen() && !regionFile->ifeof())
            {
                // Read the next interval
                regionLine.Clear();
                regionLine.ReadLine(regionFile);
                if(regionLine.IsEmpty())
                {
                    // Nothing on this line, continue to the next.
                    continue;
                }
                regionColumn.ReplaceColumns(regionLine, ' ');
                if(regionColumn.Length() != 2)
                {
                    std::cerr << "Improperly formatted region line: " 
                              << regionLine << "; skipping to the next line.\n";
                    continue;
                }
                // Convert the columns to integers.
                if(!regionColumn[0].AsInteger(start))
                {
                    // The start position (1st column) is not an integer.
                    std::cerr << "Improperly formatted region line, start position "
                              << "(1st column) is not an integer: "
                              << regionColumn[0]
                              << "; Skipping to the next line.\n";
                    continue;
                }
                if(!regionColumn[1].AsInteger(end))
                {
                    // The start position (1st column) is not an integer.
                    std::cerr << "Improperly formatted region line, end position "
                              << "(2nd column) is not an integer: "
                              << regionColumn[1]
                              << "; Skipping to the next line.\n";
                    continue;
                }
                // Add 1-based inclusive intervals.
                regions.add(start,end, intervalVal);
            }
        }
    }


    int numReadRecords = 0;

    while( inFile.readRecord(record))
    {
        if(!filterList.IsEmpty())
        {
            // Check if the region should be kept.
            intersection.clear();
            regions.get_intersecting_intervals(record.get1BasedPosition(), intersection);
            
            if(intersection.empty())
            {
                // not in the interval, so continue to the next record.
                continue;
            }
        }

        ++numReadRecords;

        // Loop through the number of possible alternates.
        unsigned int numAlts = record.getNumAlts();
        int minAlleleCount = -1;
        int curAlleleCount = 0;
        int totalAlleleCount = 0;
        for(unsigned int i = 0; i <= numAlts; i++)
        {
            curAlleleCount = record.getAlleleCount(i);
            if((minAlleleCount == -1) ||
               (curAlleleCount < minAlleleCount))
            {
                minAlleleCount = curAlleleCount;
            }
            totalAlleleCount += curAlleleCount;
        }
        if(totalAlleleCount != 0)
        {
            double maf = (double)minAlleleCount/totalAlleleCount;
            std::cout << record.getIDStr()
                      << "\t" << minAlleleCount
                      << "\t" << maf << "\n";
        }
    }
    
    inFile.close();

    //    std::cerr << "\n\t# Records: " << numReadRecords << "\n";

    // return success.
    return(0);
}
コード例 #13
0
// Call the single-file CalcRegionStats() overload on every entry of
// bamFiles, in array order.
void GenomeRegionSeqStats::CalcRegionStats(StringArray &bamFiles)
{
 int fileIdx = 0;
 while(fileIdx < bamFiles.Length())
   {
     CalcRegionStats(bamFiles[fileIdx]);
     ++fileIdx;
   }
}
コード例 #14
0
// Call the single-file CalcClusters() overload on every entry of bamFiles,
// in array order, passing the same minMapQuality each time.
void GenomeRegionSeqStats::CalcClusters(StringArray &bamFiles, int minMapQuality)
{
 int fileIdx = 0;
 while(fileIdx < bamFiles.Length())
   {
     CalcClusters(bamFiles[fileIdx], minMapQuality);
     ++fileIdx;
   }
}
コード例 #15
0
// Read the group file named by groupFile and fill annoGroups, chrom, SNPlist
// and SNPNoAllele.
//
// The file is read twice. Pass 1 records, for every line that has at least one
// SEPARATORS-delimited token, the first token in annoGroups and the second in
// chrom, and counts those lines. Pass 2 allocates one SNPlist / SNPNoAllele
// entry per counted line and fills it from the whitespace-separated variants
// that follow the group name (expected format chr:pos:allele1:allele2),
// ordered by position.
//
// Variants that do not split into exactly four fields are reported on stdout
// and in `log`, and are left out of both SNPlist and SNPNoAllele so that the
// two arrays stay index-aligned.
//
// log: open stream that receives the format warnings.
void GroupFromAnnotation::GetGroupFromFile(FILE * log)
{
   //Fill in annoGroups.
   StringArray tmp;
   FILE * file = fopen(groupFile,"r");
   if(file==NULL)
   {
      printf("ERROR! Cannot open group file %s.\n",groupFile.c_str());
      error("ERROR! Cannot open group file %s.\n",groupFile.c_str());
      return; // never read from a NULL stream, even if error() returns
   }
   String buffer;
   int line = 0;
   while (!feof(file))
   {
      buffer.ReadLine(file);
      tmp.Clear();
      tmp.AddTokens(buffer, SEPARATORS);
      if(tmp.Length()==0)
	 continue;
      annoGroups.Push(tmp[0]);
      // A line holding only a group name has no second token to read.
      if(tmp.Length()>1)
	 chrom.Push(tmp[1]);
      else
	 chrom.Push("");
      line++;
   }
   fclose(file);

   // Number of groups recorded above; also the size of the arrays below.
   const int groupCount = line;

   //Fill in SNPlist.
   SNPlist = new StringArray [groupCount];
   SNPNoAllele = new StringArray [groupCount];
   FILE * samefile = fopen(groupFile,"r");
   if(samefile==NULL)
   {
      printf("ERROR! Cannot open group file %s.\n",groupFile.c_str());
      error("ERROR! Cannot open group file %s.\n",groupFile.c_str());
      return;
   }
   line = 0;
   Vector pos;
   while (!feof(samefile) && line < groupCount)
   {
      buffer.ReadLine(samefile);
      // Skip exactly the lines pass 1 skipped. Otherwise an empty line (for
      // instance the empty read after the final newline) would shift the
      // groups and write past the end of SNPlist / SNPNoAllele.
      tmp.Clear();
      tmp.AddTokens(buffer, SEPARATORS);
      if(tmp.Length()==0)
	 continue;

      tmp.Clear();
      pos.Clear();
      tmp.AddTokens(buffer, "\t ");
      SNPlist[line].Dimension(0);
      SNPNoAllele[line].Dimension(0);
      for(int i=1;i<tmp.Length();i++)
      {
	 StringArray sub;
	 sub.Clear();
	 sub.AddTokens(tmp[i],":_/");
	 if(sub.Length()!=4)
	 {
	    printf("Warning: group %s has a variant %s that has invalid format. The correct format should be chr:pos:allele1:allele2.\n",tmp[0].c_str(),tmp[i].c_str());
	    fprintf(log,"Warning: group %s has a variant %s that has invalid format. The correct format should be chr:pos:allele1:allele2.\n",tmp[0].c_str(),tmp[i].c_str());
	    continue;
	 }
	 // Only well-formed variants are stored, so SNPlist, SNPNoAllele and
	 // pos always have the same length and can be permuted together.
	 SNPlist[line].Push(tmp[i]);
	 pos.Push(sub[1].AsInteger());
	 SNPNoAllele[line].Push(sub[0] + ":" + sub[1]);
      }
      //sort SNPlist[line] and SNPNoAllele[line]
      if(SNPlist[line].Length()>1)
      {
	 Vector sorted_pos,order;
	 sorted_pos.Copy(pos);
	 sorted_pos.Sort();
	 order.Dimension(pos.Length());
	 // order[i] = index in the unsorted list of the variant that belongs
	 // at sorted slot i.
	 for(int i=0;i<sorted_pos.Length();i++)
	 {
	    // Equal positions are adjacent after sorting. Resume the search
	    // just after the previous match so that each duplicate position
	    // maps to its own variant instead of the first one every time.
	    int first = 0;
	    if(i>0 && sorted_pos[i]==sorted_pos[i-1])
	       first = (int) order[i-1] + 1;
	    for(int j=first;j<pos.Length();j++)
	    {
	       if(sorted_pos[i]==pos[j])
	       {
		  order[i]=j; 
		  break;
	       }
	    }
	 }

	 // Permute copies so the originals can be overwritten in place.
	 StringArray cp_SNPlist,cp_SNPNoAllele;
	 cp_SNPlist.Dimension(SNPlist[line].Length());
	 cp_SNPNoAllele.Dimension(SNPNoAllele[line].Length());
	 for(int l=0;l<SNPlist[line].Length();l++)
	 {
	    cp_SNPlist[l] = SNPlist[line][l];
	    cp_SNPNoAllele[l] = SNPNoAllele[line][l];
	 }
	 for(int i=0;i<order.Length();i++)
	 {
	    SNPlist[line][i] = cp_SNPlist[(int) order[i]];
	    //printf("%s\t",SNPlist[line][i].c_str());
	    SNPNoAllele[line][i] = cp_SNPNoAllele[(int) order[i]] ;
	 }
	 //printf("\n");
      }
      line++;
   }
   fclose(samefile);
}
コード例 #16
0
ファイル: FilterStat.cpp プロジェクト: aminzia/statgen
/**
 * Accumulate allele-by-strand counts from one VCF into vCounts.
 *
 * Markers are read in file order and matched against the anchor positions in
 * vPos by advancing a cursor. For each marker the comma-separated values under
 * the "BASE" and "STRAND" format keys are turned into allele codes (0 = equals
 * vAl1, 1 = equals vAl2, 2 = anything else) and strand codes (0 = "F",
 * 1 = anything else), and the matching counter of that anchor position is
 * incremented once per observation.
 *
 * Problems (chromosome mismatch, position missing from the anchor list,
 * allele/strand lists of different size) are reported through
 * Logger::gLogger->error().
 *
 * @param file Path of the VCF to read.
 * @return Always true.
 */
bool FilterStat::appendStatVcf(const char* file) {
  VcfFile vcf;
  vcf.setSiteOnly(false);
  vcf.setParseValues(true);
  vcf.setParseGenotypes(false);
  vcf.setParseDosages(false);
  vcf.openForRead(file,1);

  StringArray tok;
  // Cursor into the anchor arrays (vPos, vAl1, vAl2, vCounts).
  // NOTE(review): the cursor is never compared against the number of anchor
  // positions - presumably every marker is expected to exist in the anchor
  // list; confirm.
  int j = 0;
  while ( vcf.iterateMarker() ) {
    VcfMarker* pMarker = vcf.getLastMarker();
    if ( sChrom.Compare(pMarker->sChrom) != 0 ) {
      Logger::gLogger->error("Chromosome name does not match - %s vs %s",sChrom.c_str(),pMarker->sChrom.c_str());
    }

    // Advance to the first anchor position that is not before this marker.
    while ( vPos[j] < pMarker->nPos ) { ++j; }

    if ( vPos[j] > pMarker->nPos ) {
      Logger::gLogger->error("Position %s:%d is not observed in the anchor VCF",sChrom.c_str(),pMarker->nPos);
    }

    std::vector<int> vAlleles;  // one allele code per observation
    std::vector<int> vStrands;  // one strand code per observation

    const int numKeys = pMarker->asFormatKeys.Length();
    for(int k=0; k < numKeys; ++k) {
      if ( pMarker->asFormatKeys[k].Compare("BASE") == 0 ) {
	tok.ReplaceColumns(pMarker->asSampleValues[k],',');
	for(int l=0; l < tok.Length(); ++l) {
	  int alleleCode = 2;
	  if ( tok[l].Compare(vAl1[j].c_str()) == 0 ) {
	    alleleCode = 0;
	  }
	  else if ( tok[l].Compare(vAl2[j].c_str()) == 0 ) {
	    alleleCode = 1;
	  }
	  vAlleles.push_back(alleleCode);
	}
      }
      else if ( pMarker->asFormatKeys[k].Compare("STRAND") == 0 ) {
	tok.ReplaceColumns(pMarker->asSampleValues[k],',');
	for(int l=0; l < tok.Length(); ++l) {
	  vStrands.push_back( tok[l].Compare("F") == 0 ? 0 : 1 );
	}
      }
    }

    if ( vAlleles.size() != vStrands.size() ) {
      Logger::gLogger->error("Alleles and Strands do not match in size at %s:%d, in %s",pMarker->sChrom.c_str(), pMarker->nPos, file);
    }

    // updates the counts - needs synchronization
    {
      //boost::mutex::scoped_lock lock(mutex);
      const int numObserved = (int) vAlleles.size();
      for(int k=0; k < numObserved; ++k) {
	// Counter slot within this anchor position: allele code * 2 + strand code.
	++(vCounts[FILTER_STAT_COUNTS*j + vAlleles[k]*2 + vStrands[k]]);
      }
    }

    // The next marker starts its search at the following anchor position.
    ++j;
  }
  return true;
}
コード例 #17
0
ファイル: Main.cpp プロジェクト: cfuchsberger/minimac
// MiniMac driver.
//
// Steps, in order:
//   1. parse the command line options,
//   2. read the reference marker list and reference haplotypes,
//   3. read the target (framework) marker list, clip the reference and map
//      every target marker to its reference index,
//   4. report target markers whose order differs from the reference,
//   5. load the target haplotypes and compare allele frequencies,
//   6. initialise the Markov model parameters (optionally from files) and
//      refine them for `rounds` rounds,
//   7. write the draft .info file, impute every target haplotype and write
//      the .dose (and, with --phased, .hapDose / .haps) files,
//   8. write the final .info file and the run time.
int main(int argc, char ** argv)
   {
   setbuf(stdout, NULL);

   time_t start = time(NULL);

   printf("MiniMac - Imputation into phased haplotypes\n"
          "(c) 2011 Goncalo Abecasis\n");
#ifdef __VERSION__
   printf("VERSION 5.0\n");
#else
   printf("UNDOCUMENTED RELEASE\n");
#endif

   int rounds = 5, states = 200, cpus = 0;
   bool em = false, gzip = false, phased = false;

   String referenceHaplotypes, referenceSnps;
   String haplotypes, snps;
   String prefix("minimac");
   String firstMarker, lastMarker;

   String recombinationRates, errorRates;

   BEGIN_LONG_PARAMETERS(longParameters)
      LONG_PARAMETER_GROUP("Reference Haplotypes")
         LONG_STRINGPARAMETER("refHaps", &referenceHaplotypes)
         LONG_STRINGPARAMETER("refSnps", &referenceSnps)
      LONG_PARAMETER_GROUP("Target Haplotypes")
         LONG_STRINGPARAMETER("haps", &haplotypes)
         LONG_STRINGPARAMETER("snps", &snps)
      LONG_PARAMETER_GROUP("Starting Parameters")
         LONG_STRINGPARAMETER("rec", &recombinationRates)
         LONG_STRINGPARAMETER("erate", &errorRates)
      LONG_PARAMETER_GROUP("Parameter Fitting")
         LONG_INTPARAMETER("rounds", &rounds)
         LONG_INTPARAMETER("states", &states)
         LONG_PARAMETER("em", &em)
      LONG_PARAMETER_GROUP("Output Files")
         LONG_STRINGPARAMETER("prefix", &prefix)
         LONG_PARAMETER("phased", &phased)
         LONG_PARAMETER("gzip", &gzip)
//    LONG_PARAMETER_GROUP("Clipping Window")
//      LONG_STRINGPARAMETER("start", &firstMarker)
//      LONG_STRINGPARAMETER("stop", &lastMarker)
#ifdef _OPENMP
      LONG_PARAMETER_GROUP("Multi-Threading")
         LONG_INTPARAMETER("cpus", &cpus)
#endif
   END_LONG_PARAMETERS();

   ParameterList pl;

   pl.Add(new LongParameters("Command Line Options", longParameters));
   pl.Read(argc, argv);
   pl.Status();

#ifdef _OPENMP
   if (cpus > 0)
      omp_set_num_threads(cpus);
#endif

   // Read marker list
   printf("Reading Reference Marker List ...\n");

   StringArray refMarkerList;
   refMarkerList.Read(referenceSnps);

   // Index markers
   StringIntHash referenceHash;
   for (int i = 0; i < refMarkerList.Length(); i++)
      referenceHash.Add(refMarkerList[i].Trim(), i);

   printf("  %d Markers in Reference Haplotypes...\n\n", refMarkerList.Length());

   // Load reference haplotypes
   printf("Loading reference haplotypes ...\n");
   HaplotypeSet reference;

   reference.markerCount = refMarkerList.Length();
   reference.LoadHaplotypes(referenceHaplotypes);

   printf("  %d Reference Haplotypes Loaded ...\n\n", reference.count);

   // Read framework marker list
   printf("Reading Framework Marker List ...\n");
   StringArray markerList;
   markerList.Read(snps);

   ClipReference(reference, refMarkerList, referenceHash, markerList,
                 firstMarker, lastMarker);

   // Crossref Marker Names to Reference Panel Positions
   IntArray markerIndex;
   markerIndex.Dimension(markerList.Length());

   int matches = 0;

   for (int i = 0; i < markerList.Length(); i++)
      {
      markerIndex[i] = referenceHash.Integer(markerList[i].Trim());

      if (markerIndex[i] >= 0) matches++;
      }

   printf("  %d Markers in Framework Haplotypes Overlap Reference ...\n", matches);

   if (matches == 0)
      error("No markers overlap between target and reference\n"
            "Please check correct reference is being used and markers are named consistently");

   printf("  %d Other Markers in Framework Haplotypes Discarded ...\n\n", markerList.Length() - matches);

   // Check for flips in reference vs. target haplotypes.
   // `previous` is the reference index of the last matched target marker. It
   // has to be refreshed for every matched marker; updating it only after a
   // flip was found left it at -1, so no flip could ever be detected.
   int flips = 0;
   int previous = -1;
   for (int i = 0; i < markerIndex.Length(); i++)
      if (markerIndex[i] >= 0)
         {
         if (markerIndex[i] < previous)
            {
            // The current target marker has the smaller reference index, so it
            // is the one that precedes in the reference but follows in target.
            if (flips++ < 10)
               printf("  -> Marker %s precedes %s in reference, but follows it in target\n",
                     (const char *) markerList[i],
                     (const char *) refMarkerList[previous]);
            }
         previous = markerIndex[i];
         }
   if (flips > 10)
      printf("  -> %d Additional Marker Order Changes Not Listed\n", flips - 10);
   if (flips)
      printf("  %d Marker Pairs Change Order in Target vs Framework Haplotypes\n", flips);

   // Load target haplotypes
   printf("Loading target haplotypes ...\n");
   HaplotypeSet target;

   target.markerCount = markerList.Length();
   target.LoadHaplotypes(haplotypes, true);

   reference.CalculateFrequencies();
   target.CalculateFrequencies();
   target.CompareFrequencies(reference, markerIndex, markerList);

   printf("  %d Target Haplotypes Loaded ...\n\n", target.count);

   // Output window; without clipping markers it spans the whole reference
   int startIndex = firstMarker.IsEmpty() ? 0 : referenceHash.Integer(firstMarker);
   int stopIndex = lastMarker.IsEmpty() ? reference.markerCount - 1 : referenceHash.Integer(lastMarker);

   if (startIndex < 0 || stopIndex < 0)
      error("Clipping requested, but no position available for one of the endpoints");

   printf("Setting up Markov Model...\n\n");

   // Setup Markov Model
   MarkovParameters mp;

   mp.Allocate(reference.markerCount);

   if (rounds > 0)
      printf("Initializing Model Parameters (using %s and up to %d haplotypes)\n",
             em ? "E-M" : "MCMC", states);

   // Simple initial estimates of error and recombination rate
   for (int i = 0; i < reference.markerCount; i++)
      mp.E[i] = 0.01;

   for (int i = 0; i < reference.markerCount - 1; i++)
      mp.R[i] = 0.001;

   if (mp.ReadErrorRates(errorRates))
      printf("  Updated error rates using data in %s ...\n", (const char *) errorRates);

   if (mp.ReadCrossoverRates(recombinationRates))
      printf("  Updated recombination rates using %s ...\n", (const char *) recombinationRates);

   // Parameter estimation loop
   for (int round = 0; round < rounds; round++)
      {
      printf("  Round %d of Parameter Refinement ...\n", round + 1);

      int iterations = states < reference.count ? states : reference.count;

      MarkovModel original;
      original.CopyParameters(mp);

      #pragma omp parallel for
      for (int i = 0; i < iterations; i++)
         {
         MarkovModel mm;

         mm.Allocate(reference.markerCount, reference.count - 1);
         mm.CopyParameters(original);

         // Reference leave one out (loo) panel
         char ** reference_loo = new char * [reference.count - 1];
         for (int in = 0, out = 0; in < reference.count; in++)
            if (in != i)
               reference_loo[out++] = reference.haplotypes[in];

         mm.WalkLeft(reference.haplotypes[i], reference_loo, reference.freq);

         if (em)
            mm.CountExpected(reference.haplotypes[i], reference_loo, reference.freq);
         else
            {
            #pragma omp critical
            { mm.ProfileModel(reference.haplotypes[i], reference_loo, reference.freq); }
            }

         delete [] reference_loo;

         // Merging into the shared parameters is serialised
         #pragma omp critical
         mp += mm;
         }

      // Target haplotypes only contribute in the second half of the rounds
      if (round >= rounds / 2)
         {
         int iterations = states < target.count ? states : target.count;

         #pragma omp parallel for
         for (int i = 0; i < iterations; i++)
            {
            MarkovModel mm;

            mm.Allocate(reference.markerCount, reference.count);
            mm.CopyParameters(original);

            // Padded version of target haplotype, including missing sites
            char * padded = new char [reference.markerCount];
            for (int k = 0; k < reference.markerCount; k++)
               padded[k] = 0;

            // Copy current haplotype into padded vector
            for (int j = 0; j < target.markerCount; j++)
               if (markerIndex[j] >= 0)
                  padded[markerIndex[j]] = target.haplotypes[i][j];

            mm.WalkLeft(padded, reference.haplotypes, reference.freq);

            if (em)
               mm.CountExpected(padded, reference.haplotypes, reference.freq);
            else
               {
               #pragma omp critical
               { mm.ProfileModel(padded, reference.haplotypes, reference.freq); }
               }

            delete [] padded;

            #pragma omp critical
            mp += mm;
            }
         }

      mp.UpdateModel();

      double crossovers = 0;
      for (int i = 0; i < reference.markerCount - 1; i++)
         crossovers += mp.R[i];

      double errors = 0;
      for (int i = 0; i < reference.markerCount; i++)
         {
         double heterozygosity = 1.0 - square(reference.freq[1][i])
                                     - square(reference.freq[2][i])
                                     - square(reference.freq[3][i])
                                     - square(reference.freq[4][i]);

         errors += mp.E[i] * heterozygosity;
         }
      // The tiny constant keeps the division defined when there are no markers
      errors /= reference.markerCount + 1e-30;

      printf("      %.0f mosaic crossovers expected per haplotype\n", crossovers);
      printf("      %.1f%% of crossovers are due to reference flips\n", mp.empiricalFlipRate * 100.);
      printf("      %.3g errors in mosaic expected per marker\n", errors);
      }

   if (rounds > 0)
      {
      printf("  Saving estimated parameters for future use ...\n");
      mp.WriteParameters(refMarkerList, prefix, gzip);
      }

   printf("\n");

   // List the major allele at each location
   reference.ListMajorAlleles();

   printf("Generating Draft .info File ...\n\n");

   // Output some basic information
   IFILE info = ifopen(prefix + ".info.draft", "wt");

   ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tGenotyped\n");

   // j walks markerIndex in step with i and only advances when the reference
   // index i equals markerIndex[j].
   // NOTE(review): an entry of markerIndex that never equals i (for instance
   // a negative, unmatched one) would stop j from advancing - confirm that
   // such entries cannot remain at this point.
   for (int i = 0, j = 0; i <= stopIndex; i++)
      if (i >= startIndex)
         ifprintf(info, "%s\t%s\t%s\t%.4f\t%s\n",
            (const char *) refMarkerList[i],
            reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i),
            reference.freq[reference.major[i]][i],
            j < markerIndex.Length() && i == markerIndex[j] ? (j++, "Genotyped") : "-");
      else
         if (j < markerIndex.Length() && i == markerIndex[j])
            j++;

   ifclose(info);

   printf("Imputing Genotypes ...\n");

   IFILE dosages = ifopen(prefix + ".dose" + (gzip ? ".gz" : ""), "wt");
   IFILE hapdose, haps;

   if (phased)
      {
      hapdose = ifopen(prefix + ".hapDose" + (gzip ? ".gz" : ""), "wt");
      haps = ifopen(prefix + ".haps" + (gzip ? ".gz" : ""), "wt");
      }

   ImputationStatistics stats(reference.markerCount);

   // Impute each haplotype
   #pragma omp parallel for
   for (int i = 0; i < target.count; i++)
      {
      // Consecutive haplotypes with the same label are handled together by
      // the iteration that sees the first of them
      if (i != 0 && target.labels[i] == target.labels[i-1])
         continue;

      MarkovModel mm;

      mm.Allocate(reference.markerCount, reference.count);
      mm.ClearImputedDose();
      mm.CopyParameters(mp);

      // Padded version of target haplotype, including missing sites
      char * padded = new char [reference.markerCount];
      for (int j = 0; j < reference.markerCount; j++)
         padded[j] = 0;

      int k = i;

      do {
         printf("  Processing Haplotype %d of %d ...\n", k + 1, target.count);

         // Copy current haplotype into padded vector
         for (int j = 0; j < target.markerCount; j++)
            if (markerIndex[j] >= 0)
               padded[markerIndex[j]] = target.haplotypes[k][j];

         mm.WalkLeft(padded, reference.haplotypes, reference.freq);
         mm.Impute(reference.major, padded, reference.haplotypes, reference.freq);

         #pragma omp critical
         { stats.Update(mm.imputedHap, mm.leaveOneOut, padded, reference.major); }

         #pragma omp critical
         if (phased)
            {
            ifprintf(hapdose, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
            ifprintf(haps, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
            for (int j = startIndex; j <= stopIndex; j++)
               {
               ifprintf(hapdose, "\t%.3f", mm.imputedHap[j]);
               ifprintf(haps, "%s%c", j % 8 == 0 ? " " : "", mm.imputedAlleles[j]);
               }
            ifprintf(hapdose, "\n");
            ifprintf(haps, "\n");
            }

         k++;
      } while (k < target.count && target.labels[k] == target.labels[i]);

      printf("    Outputting Individual %s ...\n", (const char *) target.labels[i]);

      #pragma omp critical
         {
         ifprintf(dosages, "%s\tDOSE", (const char *) target.labels[i]);
         for (int j = startIndex; j <= stopIndex; j++)
            ifprintf(dosages, "\t%.3f", mm.imputedDose[j]);
         ifprintf(dosages, "\n");
         }

      delete [] padded;
      }

   ifclose(dosages);

   if (phased)
      {
      ifclose(hapdose);
      ifclose(haps);
      }

   // Output some basic information
   info = ifopen(prefix + ".info" + (gzip ? ".gz" : ""), "wt");

   ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tMAF\tAvgCall\tRsq\tGenotyped\tLooRsq\tEmpR\tEmpRsq\tDose1\tDose2\n");

   // Padded version of target haplotype, including missing sites
   char * padded = new char [reference.markerCount];
   for (int k = 0; k < reference.markerCount; k++)
      padded[k] = 0;

   // Mark genotyped SNPs in padded vector
   for (int j = 0; j < target.markerCount; j++)
      if (markerIndex[j] >= 0)
          padded[markerIndex[j]] = 1;

   for (int i = startIndex; i <= stopIndex; i++)
      {
      ifprintf(info, "%s\t%s\t%s\t%.5f\t%.5f\t%.5f\t%.5f\t",
            (const char *) refMarkerList[i],
            reference.MajorAlleleLabel(i),
            reference.MinorAlleleLabel(i),
            stats.AlleleFrequency(i),
            stats.AlleleFrequency(i) > 0.5 ? 1.0 - stats.AlleleFrequency(i) : stats.AlleleFrequency(i),
            stats.AverageCallScore(i),
            stats.Rsq(i));

      if (padded[i])
         ifprintf(info, "Genotyped\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n",
                  stats.LooRsq(i), stats.EmpiricalR(i), stats.EmpiricalRsq(i),
                  stats.LooMajorDose(i), stats.LooMinorDose(i));
      else
         ifprintf(info, "-\t-\t-\t-\t-\t-\n");
      }

   ifclose(info);

   delete [] padded;

   time_t stop = time(NULL);
   int seconds = stop - start;

   printf("\nRun completed in %d hours, %d mins, %d seconds on %s\n\n",
          seconds / 3600, (seconds % 3600) / 60, seconds % 60,
          ctime(&stop));
   }
コード例 #18
0
ファイル: VcfFile.cpp プロジェクト: amarawi/gotcloud
// Opens a BED/BIM/FAM file set for reading and prepares the reader state.
//
//   bedFile - binary genotype file; its first three bytes must equal the
//             magic numbers checked below
//   bimFile - marker file; opened here and kept in iBimFile
//   famFile - individual file; one VcfInd is created per line read from it
//   refFile - reference name handed to genomeSequence
//   nbuf    - number of VcfMarker objects to preallocate; 0 means no
//             preallocation
//
// Throws VcfFileException when any of the three files cannot be opened, when
// the BED header is short or does not match, when a FAM line has fewer than
// five columns, or when the reference index cannot be created/opened.
void BedFile::openForRead(const char* bedFile, const char* bimFile, const char* famFile, const char* refFile, int nbuf) {
  StringArray tokens;

  reset();

  iFile = ifopen(bedFile,"rb");
  if ( iFile == NULL ) {
    throw VcfFileException("Failed opening file %s - %s",bedFile,strerror(errno));
  }
  
  // read magic numbers
  char magicNumbers[3] = {0x6c,0x1b,0x01};
  char firstThreeBytes[3] = {0,0,0};
  // A file shorter than three bytes cannot carry the header, so treat a
  // short read like a mismatch instead of comparing unread bytes.
  if ( ifread( iFile, firstThreeBytes, 3 ) != 3 ) {
    throw VcfFileException("The magic numbers do not match in BED file %s",bedFile);
  }
  for(int i=0; i < 3; ++i) {
    if ( firstThreeBytes[i] != magicNumbers[i] ) {
      throw VcfFileException("The magic numbers do not match in BED file %s",bedFile);
    }
  }

  // Both companion files are required; fail here rather than on first use.
  iBimFile = ifopen(bimFile,"rb");
  if ( iBimFile == NULL ) {
    throw VcfFileException("Failed opening file %s - %s",bimFile,strerror(errno));
  }
  iFamFile = ifopen(famFile,"rb");
  if ( iFamFile == NULL ) {
    throw VcfFileException("Failed opening file %s - %s",famFile,strerror(errno));
  }
  sRefFile = refFile;

  // Load every individual from the FAM file; reading stops at the first
  // line for which ReadLine reports a non-positive result.
  while( 1 ) {
    int ret = line.ReadLine(iFamFile);
    if ( ret <= 0 ) break;
    tokens.ReplaceTokens(line, " \t\r\n");
    if ( tokens.Length() < 5 ) {
      throw VcfFileException("Less then 5 columns are observed in FAM file");
    }
    // Note the argument order: second column first, then the first column.
    VcfInd* p = new VcfInd(tokens[1],tokens[0],tokens[2],tokens[3],tokens[4]);
    vpVcfInds.push_back(p);
  }

  //Logger::gLogger->writeLog("Finished loading %d individuals from FAM file",(int)vpVcfInds.size());

  // Per-marker buffer size: number of individuals divided by four, rounded up.
  nBytes = (vpVcfInds.size()+3)/4;
  if ( pBedBuffer != NULL ) { delete[] pBedBuffer; }
  pBedBuffer = new char[nBytes];

  nBuffers = nbuf;
  nNumMarkers = 0;
  nHead = 0;

  bParseGenotypes = true;
  bParseDosages = false;
  bParseValues = false;

  if ( nBuffers == 0 ) { // infinite buffer size
    // do not set size of markers
  }
  else {
    vpVcfMarkers.resize( nBuffers );
    for(int i=0; i < nBuffers; ++i) {
      VcfMarker* p = new VcfMarker;
      vpVcfMarkers[i] = p;
    }
  }

  genomeSequence.setReferenceName(sRefFile.c_str());
  genomeSequence.useMemoryMap(true);

  //Logger::gLogger->writeLog("Loading reference file %s",sRefFile.c_str());

  // A true result from open()/create() is treated as failure here: if the
  // first open fails, try to build the index and then open once more.
  if ( genomeSequence.open() ) {
    // write a message that new index file is being created
    if ( genomeSequence.create(false) ) {
      throw VcfFileException("Failed creating index file of the reference. Please check the file permission");
    }
    if ( genomeSequence.open() ) {
      throw VcfFileException("Failed opening index file of the reference.");
    }
  }
}
コード例 #19
0
ファイル: Stats.cpp プロジェクト: BioScripts/bamUtil
// Runs the stats command: parses the command line, reads the SAM/BAM file
// given by --in and writes the requested statistics.
//
//   argc/argv - command line; the first entry is skipped when the
//               parameters are read.
//
// Returns -1 when --in is missing or when both --pBaseQC and --cBaseQC are
// given, the SamFile status when opening the file or reading its header
// fails, and otherwise the final SamFile status with NO_MORE_RECS reported
// as SUCCESS.
int Stats::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String indexFile = "";
    bool basic = false;
    bool noeof = false;
    bool params = false;
    bool qual = false;
    bool phred = false;
    int maxNumReads = -1;
    bool unmapped = false;
    String pBaseQC = "";
    String cBaseQC = "";
    String regionList = "";
    int excludeFlags = 0;
    int requiredFlags = 0;
    bool withinRegion = false;
    int minMapQual = 0;
    String dbsnp = "";
    PosList *dbsnpListPtr = NULL;
    bool baseSum = false;
    int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER_GROUP("Types of Statistics")
        LONG_PARAMETER("basic", &basic)
        LONG_PARAMETER("qual", &qual)
        LONG_PARAMETER("phred", &phred)
        LONG_STRINGPARAMETER("pBaseQC", &pBaseQC)
        LONG_STRINGPARAMETER("cBaseQC", &cBaseQC)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_INTPARAMETER("maxNumReads", &maxNumReads)
        LONG_PARAMETER("unmapped", &unmapped)
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_STRINGPARAMETER("regionList", &regionList)
        LONG_INTPARAMETER("excludeFlags", &excludeFlags)
        LONG_INTPARAMETER("requiredFlags", &requiredFlags)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters")
        LONG_PARAMETER("withinRegion", &withinRegion)
        LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters")
        LONG_PARAMETER("baseSum", &baseSum)
        LONG_INTPARAMETER("bufferSize", &bufferSize)
        LONG_INTPARAMETER("minMapQual", &minMapQual)
        LONG_STRINGPARAMETER("dbsnp", &dbsnp)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument for stats, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Use the index file if unmapped or regionList is not empty.
    bool useIndex = (unmapped|| (!regionList.IsEmpty()));

    // IndexFile is required, so check to see if it has been set.
    if(useIndex && (indexFile == ""))
    {
        // Index file was not specified, so default it to the in file
        // + ".bai"
        indexFile = inFile + ".bai";
    }
    ////////////////////////////////////////
    // Setup in case pileup is used.
    Pileup<PileupElementBaseQCStats> pileup(bufferSize);
    // Initialize start/end positions.
    myStartPos = 0;
    myEndPos = -1;
    
    // Open the output qc file if applicable.
    IFILE baseQCPtr = NULL;
    if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty())
    {
        usage();
        inputParameters.Status();
        // Cannot specify both types of baseQC.
        std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl;
        return(-1);
    }
    else if(!pBaseQC.IsEmpty())
    {
        baseQCPtr = ifopen(pBaseQC, "w");
        PileupElementBaseQCStats::setPercentStats(true);
    }
    else if(!cBaseQC.IsEmpty())
    {
        baseQCPtr = ifopen(cBaseQC, "w");
        PileupElementBaseQCStats::setPercentStats(false);
    }

    if(baseQCPtr != NULL)
    {
        PileupElementBaseQCStats::setOutputFile(baseQCPtr);
        PileupElementBaseQCStats::printHeader();
    }
    if((baseQCPtr != NULL) || baseSum)
    {
        PileupElementBaseQCStats::setMapQualFilter(minMapQual);
        PileupElementBaseQCStats::setBaseSum(baseSum);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the file for reading.
    SamFile samIn;
    if(!samIn.OpenForRead(inFile))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    samIn.SetReadFlags(requiredFlags, excludeFlags);

    // Set whether or not basic statistics should be generated.
    samIn.GenerateStatistics(basic);

    // Read the sam header.
    SamFileHeader samHeader;
    if(!samIn.ReadHeader(samHeader))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // Open the bam index file for reading if we are
    // doing unmapped reads (also set the read section).
    if(useIndex)
    {
        samIn.ReadBamIndex(indexFile);

        if(unmapped)
        {
            samIn.SetReadSection(-1);
        }

        if(!regionList.IsEmpty())
        {
            myRegionList = ifopen(regionList, "r");
        }
    }

    //////////////////////////
    // Read dbsnp if specified and doing baseQC
    if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty()))
    {
        // Read the dbsnp file.
        IFILE fdbSnp;
        fdbSnp = ifopen(dbsnp,"r");
        // Determine how many entries: size the list by one more than the
        // longest reference length in the header.
        const SamReferenceInfo& refInfo = samHeader.getReferenceInfo();
        int maxRefLen = 0;
        for(int i = 0; i < refInfo.getNumEntries(); i++)
        {
            int refLen = refInfo.getReferenceLength(i);
            if(refLen >= maxRefLen)
            {
                maxRefLen = refLen + 1;
            }
        }
        
        dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen);

        if(fdbSnp==NULL)
        {
            std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n";
        }
        else if(dbsnpListPtr == NULL)
        {
            std::cerr << "Failed to init the memory allocation for the dbsnpList.\n";
        }
        else
        {
            // Read the dbsnp file.
            StringArray tokens;
            String buffer;
            int position = 0;
            int refID = 0;

            // Loop til the end of the file.
            while (!ifeof(fdbSnp))
            {
                // Read the next line into an empty buffer.
                buffer.Clear();
                buffer.ReadLine(fdbSnp);
                // Skip blank lines and comment lines.
                if (buffer.IsEmpty() || buffer[0] == '#') continue;
                // Start from an empty token list on every line: AddTokens
                // appends, so tokens left over from a skipped line would
                // otherwise be re-read as columns 1 and 2 of this line.
                tokens.Clear();
                tokens.AddTokens(buffer);
                // If it does not have at least 2 columns, 
                // continue to the next line.
                if(tokens.Length() < 2) continue;

                if(!tokens[1].AsInteger(position))
                {
                    std::cerr << "Improperly formatted region line, start position "
                              << "(2nd column) is not an integer: "
                              << tokens[1]
                              << "; Skipping to the next line.\n";         
                    continue;
                }

                // Look up the reference name.
                refID = samHeader.getReferenceID(tokens[0]);
                if(refID != SamReferenceInfo::NO_REF_ID)
                {
                    // Reference id was found, so add it to the dbsnp
                    dbsnpListPtr->addPosition(refID, position);
                }
            }
        }
        // Only close a handle that was actually opened.
        if(fdbSnp != NULL)
        {
            ifclose(fdbSnp);
        }
    }

    // Read the sam records.
    SamRecord samRecord;

    int numReads = 0;

    //////////////////////
    // Setup in case doing a quality count.
    // Quality histogram, indexed by the raw quality character value.
    const int MAX_QUAL = 126;
    const int START_QUAL = 33;
    uint64_t qualCount[MAX_QUAL+1];
    for(int i = 0; i <= MAX_QUAL; i++)
    {
        qualCount[i] = 0;
    }
    
    // Phred histogram, indexed by the quality character minus PHRED_DIFF.
    const int START_PHRED = 0;
    const int PHRED_DIFF = START_QUAL - START_PHRED;
    const int MAX_PHRED = MAX_QUAL - PHRED_DIFF;
    uint64_t phredCount[MAX_PHRED+1];
    for(int i = 0; i <= MAX_PHRED; i++)
    {
        phredCount[i] = 0;
    }
    
    int refPos = 0;
    Cigar* cigarPtr = NULL;
    char cigarChar = '?';
    // Exclude clips from the qual/phred counts if unmapped reads are excluded.
    bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED;

    //////////////////////////////////
    // When not reading by sections, getNextSection returns true
    // the first time, then false the next time.
    while(getNextSection(samIn))
    {
        // Keep reading records from the file until SamFile::ReadRecord
        // indicates to stop (returns false).
        while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord))
        {
            // Another record was read, so increment the number of reads.
            ++numReads;
            // See if the quality histogram should be generated.
            if(qual || phred)
            {
                // Get the quality string.  It is deliberately not named
                // "qual" so that the --qual flag stays visible below.
                const char* qualStr = samRecord.getQuality();
                // Check for no quality ('*').
                if((qualStr[0] == '*') && (qualStr[1] == 0))
                {
                    // This record does not have a quality string, so no 
                    // quality processing is necessary.
                }
                else
                {
                    int index = 0;
                    cigarPtr = samRecord.getCigarInfo();
                    cigarChar = '?';
                    refPos = samRecord.get0BasedPosition();
                    if(!qualExcludeClips && (cigarPtr != NULL))
                    {
                        // Offset the reference position by any soft clips
                        // by subtracting the queryIndex of this start position.
                        // refPos is now the start position of the clips.
                        refPos -= cigarPtr->getQueryIndex(0);
                    }

                    while(qualStr[index] != 0)
                    {
                        // Skip this quality if it is clipped and we are skipping clips.
                        if(cigarPtr != NULL)
                        {
                            cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index);
                        }
                        if(qualExcludeClips && Cigar::isClip(cigarChar))
                        {
                            // Skip a clipped quality.
                            ++index;
                            // The position is not advanced for a skipped clip.
                            continue;
                        }

                        if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos))
                        {
                            // We have hit the end of the region, stop processing this
                            // quality string.
                            break;
                        }

                        if(withinRegion && (refPos < myStartPos))
                        {
                            // This position is not in the target.
                            ++index;
                            // Update the position if this is found in the reference or a clip.
                            if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                            {
                                ++refPos;
                            }
                            continue;
                        }

                        // Check for valid quality.
                        if((qualStr[index] < START_QUAL) || (qualStr[index] > MAX_QUAL))
                        {
                            // Report only in the terms of the statistic(s)
                            // that were requested.
                            if(qual)
                            {
                                std::cerr << "Invalid Quality found: " << qualStr[index] 
                                          << ".  Must be between "
                                          << START_QUAL << " and " << MAX_QUAL << ".\n";
                            }
                            if(phred)
                            {
                                std::cerr << "Invalid Phred Quality found: " << qualStr[index] - PHRED_DIFF
                                          << ".  Must be between "
                                          << START_PHRED << " and " << MAX_PHRED << ".\n";
                            }
                            // Skip an invalid quality.
                            ++index;
                            // Update the position if this is found in the reference or a clip.
                            if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                            {
                                ++refPos;
                            }
                            continue;
                        }
                        
                        // Increment the count for this quality.
                        ++(qualCount[(int)(qualStr[index])]);
                        ++(phredCount[(int)(qualStr[index]) - PHRED_DIFF]);
                        // Update the position if this is found in the reference or a clip.
                        if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                        {
                            ++refPos;
                        }
                        ++index;
                    }
                }
            }

            // Check the next thing to do for the read.
            if((baseQCPtr != NULL) || baseSum)
            {
                // Pileup the bases for this read.
                pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr);
            }
        }

        // Done with a section, move on to the next one.

        // New section, so flush the pileup.
        pileup.flushPileup();
    }

    // Flush the rest of the pileup.
    if((baseQCPtr != NULL) || baseSum)
    {
        // Pileup the bases.
        pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr);
        PileupElementBaseQCStats::printSummary();
        // With --baseSum alone there is no output file to close.
        if(baseQCPtr != NULL)
        {
            ifclose(baseQCPtr);
        }
    }

    std::cerr << "Number of records read = " << 
        samIn.GetCurrentRecordCount() << std::endl;

    if(basic)
    {
        std::cerr << std::endl;
        samIn.PrintStatistics();
    }

    // Print the quality stats.
    if(qual)
    {
        std::cerr << std::endl;
        std::cerr << "Quality\tCount\n";
        for(int i = START_QUAL; i <= MAX_QUAL; i++)
        {
            std::cerr << i << "\t" << qualCount[i] << std::endl;
        }
    }
    // Print the phred quality stats.
    if(phred)
    {
        std::cerr << std::endl;
        std::cerr << "Phred\tCount\n";
        for(int i = START_PHRED; i <= MAX_PHRED; i++)
        {
            std::cerr << i << "\t" << phredCount[i] << std::endl;
        }
    }

    SamStatus::Status status = samIn.GetStatus();
    if(status == SamStatus::NO_MORE_RECS)
    {
        // A status of NO_MORE_RECS means that all reads were successful.
        status = SamStatus::SUCCESS;
    }

    return(status);
}
コード例 #20
0
// Trims the reference panel to the window requested through start / stop.
//
// start and stop are marker names; the literal values "start" and "stop"
// (or empty strings) mean "no limit on that side". When a requested name is
// not in the reference, it is replaced by the first marker of markerList at
// or after that name that is in the reference; if there is none, the function
// returns without clipping. start and stop are updated in place.
//
// When clipping happens, reference, refMarkerList and referenceHash are all
// rebuilt to describe only the retained markers.
void ClipReference(HaplotypeSet  & reference,
                   StringArray   & refMarkerList,
                   StringIntHash & referenceHash,
                   StringArray   & markerList,
                   String & start, String & stop)
   {
   if (start == "start")
      start.Clear();
   if (stop == "stop")
      stop.Clear();

   // Nothing was requested on either side
   if (start.IsEmpty() && stop.IsEmpty())
      return;

   // Range of reference positions covered by the target markers
   int lowest = reference.markerCount;
   int highest = -1;

   // Set once the requested name has been seen in the target list and
   // cleared again as soon as a marker present in the reference follows
   bool startPending = false;
   bool stopPending = false;
   String startProxy, stopProxy;

   for (int m = 0; m < markerList.Length(); m++)
      {
      String name = markerList[m].Trim();

      if (start == name)
         startPending = true;
      if (stop == name)
         stopPending = true;

      int refPos = referenceHash.Integer(name);

      if (refPos >= 0)
         {
         if (refPos < lowest)
            lowest = refPos;
         if (refPos > highest)
            highest = refPos;

         if (startPending)
            {
            startProxy = name;
            startPending = false;
            }

         if (stopPending)
            {
            stopProxy = name;
            stopPending = false;
            }
         }
      }

   int startIndex = referenceHash.Integer(start);
   int stopIndex = referenceHash.Integer(stop);

   // Requested start is unknown to the reference: fall back on the proxy
   if (!start.IsEmpty() && startIndex < 0)
      {
      if (startProxy.IsEmpty())
         return;

      start = startProxy;
      startIndex = referenceHash.Integer(start);
      }
   if (startIndex <= lowest)
      lowest = startIndex;

   // Same treatment for the requested stop
   if (!stop.IsEmpty() && stopIndex < 0)
      {
      if (stopProxy.IsEmpty())
         return;

      stop = stopProxy;
      stopIndex = referenceHash.Integer(stop);
      }
   if (stopIndex >= highest)
      highest = stopIndex;

   int clipFrom = start.IsEmpty() ? 0 : lowest;
   int clipTo = stop.IsEmpty() ? reference.markerCount - 1 : highest;

   // The window already spans the whole reference
   if (clipFrom <= 0 && clipTo >= reference.markerCount - 1)
      return;

   printf("  Clipping reference haplotypes to match target ...\n");

   reference.ClipHaplotypes(clipFrom, clipTo);

   // Move the surviving names to the front of a fresh list
   StringArray kept;
   kept.Dimension(reference.markerCount);
   for (int src = clipFrom, dst = 0; src <= clipTo; src++, dst++)
      kept[dst].Swap(refMarkerList[src]);
   kept.Swap(refMarkerList);

   // Re-index the hash against the new positions
   referenceHash.Clear();
   for (int k = 0; k < refMarkerList.Length(); k++)
      referenceHash.Add(refMarkerList[k].Trim(), k);

   printf("    %d Markers Remain After Clipping ...\n", reference.markerCount);
   }
コード例 #21
0
bool RegressionAnalysis::ReadModelsFromFile()
   {
   StringArray models;
   models.Read(modelsFile);

   if (models.Length() == 0)
      return false;

   regress = new FancyRegression[models.Length()];

   printf("Retrieving analysis models from file [%s]...\n",
          (const char *) modelsFile);

   modelCount = 0;

   StringArray tokens;
   for (int i = 0, line = 0; i < models.Length(); i++)
      {
      models[i].Trim();

      // Skip comments
      if (models[i][0] == '#') continue;

      // Divide each line into tokens
      tokens.Clear();
      tokens.AddTokens(models[i]);

      // Skip blank lines
      if (tokens.Length() == 0) continue;

      // Print message for tracing...
      printf("   Input: %s\n", (const char *) models[i], line++);

      // Need a minimum of four tokens per line
      if (tokens.Length() < 4)
         {
         printf(" Skipped: Trait name, mean, variance and heritability required.\n");
         continue;
         }

      regress[modelCount].trait = ped.LookupTrait(tokens[0]);

      if (regress[modelCount].trait < 0)
         {
         printf(line == 1 ? " Skipped: Appears to be a header line\n" :
                            " Skipped: Trait %s not listed in the data file\n",
                            (const char *) tokens[0]);
         continue;
         }

      // First check that mean, variance and heritability are valid numbers
      bool fail = false;
      for (int j = 1; j <= 3; j++)
         {
         char * ptr = NULL;
         strtod(tokens[j], &ptr);
         fail |= ptr[0] != 0;
         }

      // If one of the values is not a valid number, skip
      if (fail)
         {
         printf(line == 1 ? " Skipped: Appears to be a header line\n" :
                            " Skipped: Invalid numeric format\n");
         continue;
         }

      regress[modelCount].mean = tokens[1];
      regress[modelCount].variance = tokens[2];
      regress[modelCount].heritability = tokens[3];

      if (tokens.Length() > 4)
         {
         regress[modelCount].label = tokens[4];

         for (int j = 5; j < tokens.Length(); j++)
            {
            regress[modelCount].label += " ";
            regress[modelCount].label += tokens[j];
            }
         }
      else
         regress[modelCount].label.printf("Model %d", modelCount + 1);

      regress[modelCount].shortLabel = regress[modelCount].label;
      regress[modelCount].testRetestCorrel = testRetestCorrel;
      regress[modelCount].bounded = !unrestricted;

      printf("        Model loaded and labelled %s\n", (const char *) regress[modelCount].label);

      modelCount++;
      }

   if (modelCount == 0)
      {
      printf("No valid models, default model will be used\n\n");
      return false;
      }

   printf("Table processed. %d models recognized\n\n", modelCount);

   return true;
   }
コード例 #22
0
void GroupFromAnnotation::vcfInitialize()
{
	// func_upper
	if ( function != "" ) {
		func_upper.AddTokens( function, "/" );
		for( int i=0; i<func_upper.Length(); i++ )
			func_upper[i] = func_upper[i].ToUpper();
	}

	FILE * inFile;
	inFile = fopen(vcfInput,"r");
	while (!feof(inFile)) {
		String buffer;
		buffer.ReadLine( inFile);
		if ( buffer[0] == '#' )
			continue;
		StringArray vfield;
		vfield.AddTokens(buffer, "\t");
		if ( vfield.Length() < 8 )
			error("Annotation vcf only has %d columns!\n", vfield.Length());
		StringArray info_semicolon;
		info_semicolon.AddTokens( vfield[7],";" );
		
		int annofull_index = -1;
		for( int i=0; i<info_semicolon.Length(); i++ ) {
			String iheader = info_semicolon[i].SubStr(0,8);
			if (iheader == "ANNOFULL") {
				annofull_index = i;
				break;
			}
		}
		if (annofull_index == -1)
			continue;
		String anno_full_str = info_semicolon[annofull_index].SubStr(9);
		StringArray alts;
		alts.AddTokens( anno_full_str, "|" );
		for( int a=0; a<alts.Length(); a++ ) {
			StringArray sub;
			sub.AddTokens( alts[a], ":/=");
			if (func_upper.Length() != 0) { // match before add
				for(int f =0;f<func_upper.Length();f++) {
					bool pattern_match = checkPatternMatch( sub, func_upper[f] );
					if ( pattern_match ) {
						chrom.Push( vfield[0] );
						addGeneToGroupHash( sub[0] );
						break;
					}
				}
			}
			else { // no pattern to match
				chrom.Push( vfield[0] );
				addGeneToGroupHash( sub[0] );		
			}
		}
	}

// vectors	
	SNPlist = new StringArray [geneCount];
	SNPNoAllele = new StringArray [geneCount];
	pos = new Vector [geneCount];
	
}