Esempio n. 1
0
/*
 * Fetch the product ID from the given device path.
 *
 * The path is handed to searchOne() together with a pattern whose single
 * capture group is the four hexadecimal digits following "PID_" (after a
 * preceding "VID_"). The text that comes back is parsed as a base-16 number.
 * NOTE(review): searchOne() is assumed to return the captured group, or an
 * empty string when nothing matches -- confirm against its definition.
 *
 * Returns the parsed product ID, or 0 when searchOne() yields an empty
 * result or the result cannot be converted to a number.
 */
static inline uint16_t pidFromDevPath(std::string const& devpath)
{
	try
	{
		const auto pidHex = searchOne(devpath, "VID_.*PID_([0-9a-fA-F]{4})");

		// Only attempt the conversion on non-empty text; this avoids
		// raising (and catching) an exception for the common "no match" case.
		if (!pidHex.empty())
		{
			const int parsed = std::stoi(pidHex, nullptr, 16);
			return static_cast<uint16_t>(parsed);
		}
	}
	catch (const std::invalid_argument&)
	{
		// Not a number after all: fall through to the "unknown" result.
	}

	return 0;
}
void searchOneMaskTrim(struct dnaSeq *seq, boolean isProt,
		       struct genoFind *gf, FILE *outFile,
		       struct hash *maskHash,
		       long long *retTotalSize, int *retCount)
/* Mask and trim one query sequence, then search it against a single
 * genoFind index by calling searchOne().  The size of the original
 * (untrimmed) seq is added to *retTotalSize and *retCount is increased
 * by one.  The qMask and qType variables, declared outside this function,
 * select the masking mode and whether 'u'/'t' swapping is applied. */
{
/* Query masking is on whenever qMask is set; "lower" selects lower-case masking. */
boolean hasQueryMask = (qMask != NULL);
boolean lowerCaseMask = (hasQueryMask && sameWord(qMask, "lower"));
Bits *queryMaskBits = maskQuerySeq(seq, isProt, hasQueryMask, lowerCaseMask);

/* Work on a zero-initialized trimmed view of the query. */
struct dnaSeq trimmed;
ZeroVar(&trimmed);
trimSeq(seq, &trimmed);

/* RNA query types get memSwapChar() applied with 'u' and 't' before searching. */
if (qType == gftRna || qType == gftRnaX)
   memSwapChar(trimmed.dna, trimmed.size, 'u', 't');

searchOne(&trimmed, gf, outFile, isProt, maskHash, queryMaskBits);
bitFree(&queryMaskBits);

/* Totals are based on the size of the sequence as passed in, not the trimmed one. */
*retTotalSize += seq->size;
*retCount += 1;
}
// search - runs the searches
void SpectraSTMzXMLSearchTask::search() {
	
  if (!m_params.indexCacheAll) {
    
    // Not caching all entries. In this case, the queries have to be sorted by precursor m/z first, such that
    // the cached window slides from low to high precursor m/z only once. (Otherwise, the cached entries will need to be swapped 
    // in and out repeatedly, defeating the purpose of caching.)
    
    // There is a catch however. To be able to search out of order, one has to keep many mzXML files open, and most systems
    // have a max file opened limit. In such case, we will need to divide the mzXML files into smaller batches. The library will
    // will need to be read numBatches times, but the tradeoff is we won't need to keep all entries cached in memory by selecting
    // the indexCacheAll option.
    
    // Divide the mzXML files into equal batches of at most MAX_NUM_OPEN_FILES files. 
    unsigned int numBatches = ((unsigned int)m_searchFileNames.size() - 1) / MAX_NUM_OPEN_FILES + 1;
    unsigned int batchStart = 0;
    for (unsigned int b = 0; b < numBatches; b++) {
      m_batchBoundaries.push_back(batchStart);
      batchStart += (unsigned int)m_searchFileNames.size() / numBatches;
    }
    m_batchBoundaries.push_back((unsigned int)m_searchFileNames.size());

    // For each batch, sort all spectra by precursor m/z, open the files, and set the search in motion
    for (unsigned int batch = 0; batch < (unsigned int)m_batchBoundaries.size() - 1; batch++) {

      // this will do the sorting. the vector m_scans will be populated with the sorted scans.
      prepareSortedSearch((unsigned int)batch);
      
      // open the output files and print the headers (e.g. the xml definitions, ms run info, etc)
      for (unsigned int n = m_batchBoundaries[batch]; n < m_batchBoundaries[batch + 1]; n++) {
        m_outputs[n]->openFile();
        m_outputs[n]->printHeader();
      }
      
      // tracking search progress
      ProgressCount pc(!g_quiet && !g_verbose, 1, (int)(m_scans.size()));
      string msg("Searching");
      pc.start(msg);
    
      // create searches from the m_scans one-by-one, and search them
      for (vector<pair<unsigned int, rampScanInfo*> >::iterator i = m_scans.begin(); i != m_scans.end(); i++) {
      
        searchOne((*i).first, (*i).second);
        pc.increment();	
      
        // done. we can delete the rampScanInfo object now.
        delete (*i).second;
      }	
      pc.done();
    
      // log the search of the batch
      stringstream searchLogss;
      searchLogss << "Searched sorted scans ";
      searchLogss << "(Max " << m_numScansInFile << " scans; " << m_numSearchedInFile << " searched, ";
      searchLogss << m_numLikelyGoodInFile << " likely good; ";
      if (m_numNotSelectedInFile > 0) {
        searchLogss << m_numNotSelectedInFile << " not selected; ";
      }
      searchLogss << m_numFailedFilterInFile << " failed filter; " << m_numMissingInFile << " missing; " << m_numMS1InFile << " MS1)";
      g_log->log("MZXML SEARCH", searchLogss.str());
    
      // done with this batch. close the files so that we can open more files in the next batch
      for (unsigned int n = m_batchBoundaries[batch]; n < m_batchBoundaries[batch + 1]; n++) {
        delete (m_files[n].second); // the cramp objects, which will close the mzXML files
        m_files[n].second = NULL;
        m_outputs[n]->printFooter(); // the output files
        m_outputs[n]->closeFile();
      }
    }
    
  } else {
    
    // This is the case where we're caching everything anyway. In this case, it is not necessary to sort
    // by precursor m/z before searching. We simply open the files one by one and search the queries
    // in the order they are read.
    
    for (unsigned int n = 0; n < (unsigned int)m_searchFileNames.size(); n++) {
      
      
      // open the file using cRamp
      cRamp* cramp = new cRamp(m_searchFileNames[n].c_str());
      
      if (!cramp->OK()) {
        g_log->error("MZXML SEARCH", "Cannot open file \"" + m_searchFileNames[n] + "\". File skipped.");
        delete (cramp);
        continue;
      }
      
      // Read the run info to extract the number of scans
      rampRunInfo* runInfo = cramp->getRunInfo();
      
      if (!runInfo) {
        // probably an empty file...
        g_log->error("MZXML SEARCH", "Cannot open file \"" + m_searchFileNames[n] + "\". File skipped.");
        delete (cramp);
        continue;
      }
      
      rampInstrumentInfo* instr = cramp->getInstrumentInfo();
      if (instr) {
        m_outputs[n]->setInstrInfo(instr);
//        delete (instr);
      }
      
      // open the output file
      m_outputs[n]->openFile();
      m_outputs[n]->printHeader();
      
      int numScans = cramp->getLastScan();
      
      delete (runInfo);
      
      // parse out the file name to determine the query prefix. Note that the query string
      // has the form <mzXML file name>.<scan num>.<scan num>.0
      FileName fn;
      parseFileName(m_searchFileNames[n], fn);
      
      // m_files is a vector of (FileName, cRamp*)
      m_files[n].first = fn;			
      m_files[n].second = cramp;
      
      ProgressCount pc(!g_quiet && !g_verbose, 1, numScans);
      stringstream msg;
      msg << "Searching \"" << m_searchFileNames[n] << "\" " << "(" << n + 1 << " of " << m_searchFileNames.size() << ")";
      pc.start(msg.str());	
      
      m_numScansInFile = numScans;
      m_numNotSelectedInFile = 0;
      m_numMissingInFile = 0;
      m_numMS1InFile = 0;
      m_numFailedFilterInFile = 0;
      m_numSearchedInFile = 0;
      m_numLikelyGoodInFile = 0;

      
      for (int k = 1; k <= numScans; k++) {	
	
        pc.increment();

        // Filter out all scans not in selected list (in this case,
	// the selected list contains a list of scan numbers as strings
	if (!m_searchAll && !isInSelectedList(SpectraSTQuery::constructQueryName(fn.name, k, 0))) {
          m_numNotSelectedInFile++;
	  continue;	
	}
	// get the scan header (no peak list) first to check whether it's MS2. 
	// it'd be a waste of time if we read all scans, including MS1
	rampScanInfo* scanInfo = cramp->getScanHeaderInfo(k);			

	// check to make sure the scan is good, and is not MS1	
        if (!scanInfo || (!m_isMzData && scanInfo->m_data.acquisitionNum != k)) {
          m_numMissingInFile++;          
          
          if (scanInfo) delete (scanInfo);
          continue;
        }
	
        if (scanInfo->m_data.msLevel == 1) {
          m_numMS1InFile++;
          delete (scanInfo);
          continue;
        }
          
        // now we can search
        searchOne(n, scanInfo);
	// done, can delete scanInfo
	delete scanInfo;
	
      }		
      pc.done();
      

      
      // log the search of this file
      stringstream searchLogss;
      searchLogss << "Searched \"" << m_searchFileNames[n] + "\" ";
      searchLogss << "(Max " << m_numScansInFile << " scans; " << m_numSearchedInFile << " searched, ";
      searchLogss << m_numLikelyGoodInFile << " likely good; ";
      if (m_numNotSelectedInFile > 0) {
        searchLogss << m_numNotSelectedInFile << " not selected; ";
      }
      searchLogss << m_numFailedFilterInFile << " failed filter; " << m_numMissingInFile << " missing; " << m_numMS1InFile << " MS1)";
      g_log->log("MZXML SEARCH", searchLogss.str());
           
      // we can delete the cRamp object now that we're done with this file.
      // this is in contrast to the case where we're opening all the files at once for
      // sorting -- in that case the cRamp objects will be deleted at the end of all
      // searches
      delete (m_files[n].second);
      m_files[n].second = NULL;
    
      m_outputs[n]->printFooter();
      m_outputs[n]->closeFile(); // just so we won't hit the File Open limit if there are too many files
    }	
  }
  
  m_searchTaskStats.logStats();
 
  
}