コード例 #1
0
ファイル: sqio.c プロジェクト: obbila/CustomWise
/* Function: ReadMultipleRseqs()
 * 
 * Purpose:  Open a sequence file and parse it into an array of rseqs
 *           (raw, unaligned sequences) plus a parallel array of SQINFO
 *           records.
 *
 *           Alignment formats (kSelex, kMSF, kClustal) are read in one
 *           go with ReadAlignment() and then dealigned. Every other
 *           format is read one sequence at a time with ReadSeq(), and
 *           the arrays are grown 16 entries at a time.
 * 
 * Args:     seqfile    - name of the file to read
 *           fformat    - format code of the file
 *           ret_rseqs  - RETURN: array of unaligned sequences
 *           ret_sqinfo - RETURN: array of per-sequence info
 *           ret_num    - RETURN: number of sequences read
 *
 *           Caller is responsible for free'ing the memory returned in
 *           ret_rseqs and ret_sqinfo.
 * 
 * Returns 1 on success. Returns 0 on failure, leaving the RETURN
 * arguments untouched and squid_errno as set by the helper that failed.
 */
int
ReadMultipleRseqs(char              *seqfile,
		  int                fformat,
		  char            ***ret_rseqs,
		  SQINFO **ret_sqinfo,
		  int               *ret_num)
{
  SQINFO *sqinfo;               /* array of sequence optional info         */
  SQFILE *dbfp;                 /* open ptr for sequential access of file  */
  char  **rseqs;                /* sequence array                          */
  char  **aseqs;                /* aligned sequences, if file is aligned   */
  AINFO   ainfo;                /* alignment-associated information        */
  int     numalloced;           /* num of seqs currently alloced for       */
  int     idx;
  int     num;

  if (fformat == kSelex || fformat == kMSF || fformat == kClustal)
    {
      if (! ReadAlignment(seqfile, fformat, &aseqs, &ainfo)) return 0;
      if (! DealignAseqs(aseqs, ainfo.nseq, &rseqs))
	{
				/* don't leak the alignment we just read */
	  FreeAlignment(aseqs, &ainfo);
	  return 0;
	}

      /* copy the sqinfo array, because the original goes away
       * with the alignment
       */
      num    = ainfo.nseq;
      sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * ainfo.nseq);
      for (idx = 0; idx < ainfo.nseq; idx++)
	SeqinfoCopy(&(sqinfo[idx]), &(ainfo.sqinfo[idx]));
      FreeAlignment(aseqs, &ainfo);
    }
  else
    {
      /* Open the file before allocating anything, so that a failed
       * open has nothing to clean up.
       */
      if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0;

				/* initial alloc */
      num        = 0;
      numalloced = 16;
      rseqs  = (char **) MallocOrDie (numalloced * sizeof(char *));
      sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO));

      /* There is always room for slot [num] when ReadSeq() is called:
       * the arrays are grown as soon as they fill up.
       */
      while (ReadSeq(dbfp, fformat, &rseqs[num], &(sqinfo[num])))
	{
	  num++;
	  if (num == numalloced) /* more seqs coming, alloc more room */
	    {
	      numalloced += 16;
	      rseqs  = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *));
	      sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO));
	    }
	}
      SeqfileClose(dbfp);
    }

  *ret_rseqs  = rseqs;
  *ret_sqinfo = sqinfo;
  *ret_num    = num;
  return 1;
}
コード例 #2
0
ファイル: AlignmentReader.cpp プロジェクト: JohnNash/MOSAIK
	// Fills every element already present in `alignments` by calling
	// ReadAlignment() on it, in order, and then tags that element with
	// `readGroupCode`. The vector is never resized here, so the caller
	// decides how many alignments are read by sizing it beforehand. All
	// remaining arguments are forwarded unchanged to ReadAlignment().
	void CAlignmentReader::ReadAlignments(
		vector<Alignment>&   alignments, 
		const bool&          isLongRead, 
		const bool&          isPairedInSequencing, 
		const bool&          isResolvedAsPair, 
		const unsigned int&  readGroupCode,
		const int&           numMate1OriginalAlignments,
		const int&           numMate2OriginalAlignments,
		const int&           numMate1Hashes,
		const int&           numMate2Hashes,
		const bool&          hasCsString) {

		typedef vector<Alignment>::size_type SlotIndex;

		for(SlotIndex slot = 0; slot < alignments.size(); ++slot) {
			Alignment& current = alignments[slot];

			ReadAlignment(
			    current,
			    isLongRead,
			    isPairedInSequencing,
			    isResolvedAsPair,
			    numMate1OriginalAlignments,
			    numMate2OriginalAlignments,
			    numMate1Hashes,
			    numMate2Hashes,
			    hasCsString);

			// the read group is supplied by the caller, not by the per-alignment read
			current.ReadGroupCode = readGroupCode;
		}
	}
コード例 #3
0
int ExtractGHKM::Main(int argc, char *argv[])
{
  // Process command-line options.
  Options options;
  ProcessOptions(argc, argv, options);

  // Open input files.
  InputFileStream targetStream(options.targetFile);
  InputFileStream sourceStream(options.sourceFile);
  InputFileStream alignmentStream(options.alignmentFile);

  // Open output files.
  OutputFileStream fwdExtractStream;
  OutputFileStream invExtractStream;
  std::ofstream glueGrammarStream;
  std::ofstream unknownWordStream;
  std::string fwdFileName = options.extractFile;
  std::string invFileName = options.extractFile + std::string(".inv");
  if (options.gzOutput) {
    fwdFileName += ".gz";
    invFileName += ".gz";
  }
  OpenOutputFileOrDie(fwdFileName, fwdExtractStream);
  OpenOutputFileOrDie(invFileName, invExtractStream);
  if (!options.glueGrammarFile.empty()) {
    OpenOutputFileOrDie(options.glueGrammarFile, glueGrammarStream);
  }
  if (!options.unknownWordFile.empty()) {
    OpenOutputFileOrDie(options.unknownWordFile, unknownWordStream);
  }

  // Target label sets for producing glue grammar.
  std::set<std::string> labelSet;
  std::map<std::string, int> topLabelSet;

  // Word count statistics for producing unknown word labels.
  std::map<std::string, int> wordCount;
  std::map<std::string, std::string> wordLabel;

  std::string targetLine;
  std::string sourceLine;
  std::string alignmentLine;
  XmlTreeParser xmlTreeParser(labelSet, topLabelSet);
  ScfgRuleWriter writer(fwdExtractStream, invExtractStream, options);
  size_t lineNum = options.sentenceOffset;
  while (true) {
    std::getline(targetStream, targetLine);
    std::getline(sourceStream, sourceLine);
    std::getline(alignmentStream, alignmentLine);

    if (targetStream.eof() && sourceStream.eof() && alignmentStream.eof()) {
      break;
    }

    if (targetStream.eof() || sourceStream.eof() || alignmentStream.eof()) {
      Error("Files must contain same number of lines");
    }

    ++lineNum;

    // Parse target tree.
    if (targetLine.size() == 0) {
      std::cerr << "skipping line " << lineNum << " with empty target tree\n";
      continue;
    }
    std::auto_ptr<ParseTree> t;
    try {
      t = xmlTreeParser.Parse(targetLine);
      assert(t.get());
    } catch (const Exception &e) {
      std::ostringstream s;
      s << "Failed to parse XML tree at line " << lineNum;
      if (!e.GetMsg().empty()) {
        s << ": " << e.GetMsg();
      }
      Error(s.str());
    }

    // Read source tokens.
    std::vector<std::string> sourceTokens(ReadTokens(sourceLine));

    // Read word alignments.
    Alignment alignment;
    try {
      alignment = ReadAlignment(alignmentLine);
    } catch (const Exception &e) {
      std::ostringstream s;
      s << "Failed to read alignment at line " << lineNum << ": ";
      s << e.GetMsg();
      Error(s.str());
    }
    if (alignment.size() == 0) {
      std::cerr << "skipping line " << lineNum << " without alignment points\n";
      continue;
    }

    // Record word counts.
    if (!options.unknownWordFile.empty()) {
      CollectWordLabelCounts(*t, options, wordCount, wordLabel);
    }

    // Form an alignment graph from the target tree, source words, and
    // alignment.
    AlignmentGraph graph(t.get(), sourceTokens, alignment);

    // Extract minimal rules, adding each rule to its root node's rule set.
    graph.ExtractMinimalRules(options);

    // Extract composed rules.
    if (!options.minimal) {
      graph.ExtractComposedRules(options);
    }

    // Write the rules, subject to scope pruning.
    const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
    for (std::vector<Node *>::const_iterator p = targetNodes.begin();
         p != targetNodes.end(); ++p) {
      const std::vector<const Subgraph *> &rules = (*p)->GetRules();
      for (std::vector<const Subgraph *>::const_iterator q = rules.begin();
           q != rules.end(); ++q) {
        ScfgRule r(**q);
        // TODO Can scope pruning be done earlier?
        if (r.Scope() <= options.maxScope) {
          writer.Write(r);
        }
      }
    }
  }

  if (!options.glueGrammarFile.empty()) {
    WriteGlueGrammar(labelSet, topLabelSet, glueGrammarStream);
  }

  if (!options.unknownWordFile.empty()) {
    WriteUnknownWordLabel(wordCount, wordLabel, options, unknownWordStream);
  }

  return 0;
}
コード例 #4
0
/* Function: include_alignment()
 * Date:     SRE, Sun Jul  5 15:25:13 1998 [St. Louis]
 *
 * Purpose:  Read a multiple alignment file, align that alignment
 *           to the HMM, and append the resulting per-sequence
 *           traces, raw sequences, digitized sequences and SQINFO
 *           records to the caller's existing arrays.
 *
 *           The master trace comes from the HMM's alignment map
 *           when do_mapped is TRUE (after the alignment checksum
 *           has been compared against the HMM's), and from
 *           P7ViterbiAlignAlignment() otherwise.
 *
 * Args:     seqfile  - name of alignment file
 *           hmm      - model to align to
 *           do_mapped- TRUE if we're to use the HMM's alignment map
 *           rsq      - RETURN: array of rseqs to add to
 *           dsq      - RETURN: array of dsq to add to
 *           sqinfo   - RETURN: array of SQINFO to add to
 *           tr       - RETURN: array of traces to add to
 *           nseq     - RETURN: number of seqs
 *
 * Returns:  (void). rsq, dsq, sqinfo and tr are replaced by grown
 *           arrays and nseq is increased by the number of sequences
 *           in the alignment. Any failure is reported via ajFatal().
 */
void
include_alignment(char *seqfile, struct plan7_s *hmm, int do_mapped,
                  char ***rsq, char ***dsq, SQINFO **sqinfo,
                  struct p7trace_s ***tr, int *nseq)
{
  int    format;                /* format of alignment file           */
  char **aseq;                  /* aligned seqs                       */
  char **newdsq;                /* digitized copies of aseq           */
  char **newrseq;               /* dealigned copies of aseq           */
  AINFO  ainfo;                 /* info that goes with aseq           */
  int    i;                     /* counter over the new seqs          */
  int    base;                  /* number of seqs the caller had      */
  struct p7trace_s  *master;    /* master trace                       */
  struct p7trace_s **addtr;     /* individual traces for aseq         */

                                /* figure out what we're reading */
  if (! SeqfileFormat(seqfile, &format, NULL))
    {
      if (squid_errno == SQERR_NOFILE)
        ajFatal("Alignment file %s could not be opened for reading", seqfile);
                                /* every other cause, and the formal
                                 * continuation of the case above */
      ajFatal("Failed to determine format of alignment file %s", seqfile);
    }

                                /* read the alignment, force upper case */
  if (! ReadAlignment(seqfile, format, &aseq, &ainfo))
    ajFatal("Failed to read aligned sequence file %s", seqfile);
  for (i = 0; i < ainfo.nseq; i++)
    s2upper(aseq[i]);

                                /* build the master trace; the map is only
                                 * usable if the checksums agree */
  if (do_mapped)
    {
      if (GCGMultchecksum(aseq, ainfo.nseq) != hmm->checksum)
        ajFatal("The checksums for alignment file %s and the HMM alignment map don't match.", 
                seqfile);
      master = MasterTraceFromMap(hmm->map, hmm->M, ainfo.alen);
    }
  else
    {
      master = P7ViterbiAlignAlignment(aseq, &ainfo, hmm);
    }

                                /* one trace per aligned seq, appended to
                                 * the caller's traces */
  ImposeMasterTrace(aseq, ainfo.nseq, master, &addtr);
  *tr = MergeTraceArrays(*tr, *nseq, addtr, ainfo.nseq);

                                /* new entries go in slots base..base+nseq-1 */
  base = *nseq;

                                /* raw seqs: take over the strings, drop
                                 * only the temporary pointer array */
  *rsq = ReallocOrDie(*rsq, sizeof(char *) * (base + ainfo.nseq));
  DealignAseqs(aseq, ainfo.nseq, &newrseq);
  for (i = 0; i < ainfo.nseq; i++)
    (*rsq)[base + i] = newrseq[i];
  free(newrseq);

                                /* digitized seqs: same scheme */
  *dsq = ReallocOrDie(*dsq, sizeof(char *) * (base + ainfo.nseq));
  DigitizeAlignment(aseq, &ainfo, &newdsq);
  for (i = 0; i < ainfo.nseq; i++)
    (*dsq)[base + i] = newdsq[i];
  free(newdsq);

                                /* SQINFO: copied, since the alignment's own
                                 * records are released below */
  *sqinfo = ReallocOrDie(*sqinfo, sizeof(SQINFO) * (base + ainfo.nseq));
  for (i = 0; i < ainfo.nseq; i++)
    SeqinfoCopy(&((*sqinfo)[base + i]), &(ainfo.sqinfo[i]));

  *nseq = base + ainfo.nseq;

                                /* cleanup */
  P7FreeTrace(master);
  FreeAlignment(aseq, &ainfo);
}
コード例 #5
0
ファイル: sqio.c プロジェクト: obbila/CustomWise
/* Function: SeqfileOpen()
 * 
 * Purpose : Open a sequence database file and prepare for reading
 *           sequentially.
 *
 *           For interleaved alignment formats the whole alignment
 *           is read up front with ReadAlignment(), so that sequences
 *           can be handed out one at a time later. For all other
 *           formats the first line of the file is loaded.
 *           
 * Args:     filename - name of file to open; "-" means stdin, and
 *                      a name ending in ".gz" is read through a
 *                      pipe from "gzip -dc"
 *           format   - format of file
 *           env      - environment variable for path (e.g. BLASTDB),
 *                      passed to EnvFileOpen() if a plain fopen() fails
 *
 * Returns:  opened SQFILE ptr, or NULL on failure. On failure nothing
 *           stays allocated or open. squid_errno is set to
 *           SQERR_PARAMETER if the gzip command line would be too
 *           long, and to SQERR_NOFILE if the file or pipe can't be
 *           opened.
 */
SQFILE *
SeqfileOpen(char *filename, int format, char *env)
{
  SQFILE *dbfp;

  dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE));
  dbfp->format   = format;
  dbfp->longline = FALSE;

  /* Open our file handle.
   * Three possibilities:
   *    1. normal file open
   *    2. filename = "-";    read from stdin
   *    3. filename = "*.gz"; read thru pipe from gzip 
   * If we're reading from stdin or a pipe, we can't reliably
   * back up, so we can't do two-pass parsers like the interleaved alignment   
   * formats.
   */
  if (strcmp(filename, "-") == 0)
    {
      if (IsInterleavedFormat(format))
	Die("Can't read interleaved alignment formats thru stdin, sorry");

      dbfp->f         = stdin;
      dbfp->do_stdin  = TRUE; 
      dbfp->do_gzip   = FALSE;
    }
  else if (Strparse("^.*\\.gz$", filename, 0) == 0)
    {
      char cmd[256];

      if (IsInterleavedFormat(format))
	Die("Can't read interleaved alignment formats thru gunzip, sorry");

				/* command plus its trailing NUL must fit in cmd */
      if (strlen(filename) + strlen("gzip -dc ") >= sizeof(cmd))
	{ free(dbfp); squid_errno = SQERR_PARAMETER; return NULL; }

      /* NOTE(review): filename goes into a shell command line unquoted;
       * names containing shell metacharacters would be interpreted by
       * the shell. Confirm that callers only pass trusted names.
       */
      sprintf(cmd, "gzip -dc %s", filename);
      if ((dbfp->f = popen(cmd, "r")) == NULL)
	{ free(dbfp); squid_errno = SQERR_NOFILE; return NULL; } /* file (or gzip!) doesn't exist */
      dbfp->do_stdin = FALSE;
      dbfp->do_gzip  = TRUE;
    }
  else
    {
      if ((dbfp->f = fopen(filename, "r")) == NULL &&
	  (dbfp->f = EnvFileOpen(filename, env)) == NULL)
	{ free(dbfp); squid_errno = SQERR_NOFILE; return NULL; }
      dbfp->do_stdin = FALSE;
      dbfp->do_gzip  = FALSE;
    }
  
  /* The hack for sequential access of an interleaved alignment file:
   * read the alignment in, we'll copy sequences out one at a time.
   */
  dbfp->ali_aseqs = NULL;
  if (IsInterleavedFormat(format))
    {
      if (! ReadAlignment(filename, format, &(dbfp->ali_aseqs), &(dbfp->ali_ainfo)))
	{
	  /* Release the handle the way it was obtained; stdin is not
	   * ours to close.
	   */
	  if      (dbfp->do_gzip)    pclose(dbfp->f);
	  else if (! dbfp->do_stdin) fclose(dbfp->f);
	  free(dbfp);
	  return NULL;
	}
      dbfp->ali_curridx = 0;
      return dbfp;
    }

  /* Load the first line.
   */
  getline2(dbfp);

  return dbfp;
}