/* Function: ReadMultipleRseqs()
 *
 * Purpose:  Open a data file and parse it into an array of rseqs
 *           (raw, unaligned sequences) plus a parallel array of
 *           per-sequence SQINFO records.
 *
 *           Files in an alignment format (kSelex, kMSF, kClustal) are
 *           read in one go with ReadAlignment() and then dealigned.
 *           Every other format is read sequentially, one sequence at
 *           a time, with ReadSeq(); the arrays grow in steps of 16.
 *
 * Args:     seqfile    - name of the file to read
 *           fformat    - format code of the file
 *           ret_rseqs  - RETURN: array of sequences
 *           ret_sqinfo - RETURN: array of SQINFO, one per sequence
 *           ret_num    - RETURN: number of sequences read
 *
 *           Caller is responsible for free'ing memory allocated
 *           to ret_rseqs and ret_sqinfo.
 *
 * Returns 1 on success. Returns 0 on failure, in which case the ret_*
 * arguments are not written and nothing allocated here is left behind;
 * squid_errno is expected to have been set by the routine that failed.
 */
int
ReadMultipleRseqs(char *seqfile, int fformat, char ***ret_rseqs,
                  SQINFO **ret_sqinfo, int *ret_num)
{
  SQINFO *sqinfo;               /* array of sequence optional info         */
  SQFILE *dbfp;                 /* open ptr for sequential access of file  */
  char  **rseqs;                /* sequence array                          */
  char  **aseqs;                /* aligned sequences, if file is aligned   */
  AINFO   ainfo;                /* alignment-associated information        */
  int     numalloced;           /* num of seqs currently alloced for       */
  int     idx;
  int     num;

  if (fformat == kSelex || fformat == kMSF || fformat == kClustal)
    {
      if (! ReadAlignment(seqfile, fformat, &aseqs, &ainfo)) return 0;
      if (! DealignAseqs(aseqs, ainfo.nseq, &rseqs))
        {
          /* don't leak the alignment we just read */
          FreeAlignment(aseqs, &ainfo);
          return 0;
        }

      /* copy the sqinfo array before the alignment is released */
      num    = ainfo.nseq;
      sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * ainfo.nseq);
      for (idx = 0; idx < ainfo.nseq; idx++)
        SeqinfoCopy(&(sqinfo[idx]), &(ainfo.sqinfo[idx]));
      FreeAlignment(aseqs, &ainfo);
    }
  else
    {
      /* Open the file before allocating anything, so that a failed
       * open has nothing to clean up.
       */
      if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0;

      /* initial alloc */
      num        = 0;
      numalloced = 16;
      rseqs  = (char **)  MallocOrDie (numalloced * sizeof(char *));
      sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO));

      while (ReadSeq(dbfp, fformat, &rseqs[num], &(sqinfo[num])))
        {
          num++;
          if (num == numalloced) /* more seqs coming, alloc more room */
            {
              numalloced += 16;
              rseqs  = (char **)  ReallocOrDie (rseqs,  numalloced * sizeof(char *));
              sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO));
            }
        }
      SeqfileClose(dbfp);
    }

  *ret_rseqs  = rseqs;
  *ret_sqinfo = sqinfo;
  *ret_num    = num;
  return 1;
}
// deserializes each alignment and stores them in the supplied vector void CAlignmentReader::ReadAlignments( vector<Alignment>& alignments, const bool& isLongRead, const bool& isPairedInSequencing, const bool& isResolvedAsPair, const unsigned int& readGroupCode, const int& numMate1OriginalAlignments, const int& numMate2OriginalAlignments, const int& numMate1Hashes, const int& numMate2Hashes, const bool& hasCsString) { vector<Alignment>::iterator alIter; for(alIter = alignments.begin(); alIter != alignments.end(); ++alIter) { ReadAlignment(*alIter, isLongRead, isPairedInSequencing, isResolvedAsPair, numMate1OriginalAlignments, numMate2OriginalAlignments, numMate1Hashes, numMate2Hashes, hasCsString); alIter->ReadGroupCode = readGroupCode; } }
int ExtractGHKM::Main(int argc, char *argv[]) { // Process command-line options. Options options; ProcessOptions(argc, argv, options); // Open input files. InputFileStream targetStream(options.targetFile); InputFileStream sourceStream(options.sourceFile); InputFileStream alignmentStream(options.alignmentFile); // Open output files. OutputFileStream fwdExtractStream; OutputFileStream invExtractStream; std::ofstream glueGrammarStream; std::ofstream unknownWordStream; std::string fwdFileName = options.extractFile; std::string invFileName = options.extractFile + std::string(".inv"); if (options.gzOutput) { fwdFileName += ".gz"; invFileName += ".gz"; } OpenOutputFileOrDie(fwdFileName, fwdExtractStream); OpenOutputFileOrDie(invFileName, invExtractStream); if (!options.glueGrammarFile.empty()) { OpenOutputFileOrDie(options.glueGrammarFile, glueGrammarStream); } if (!options.unknownWordFile.empty()) { OpenOutputFileOrDie(options.unknownWordFile, unknownWordStream); } // Target label sets for producing glue grammar. std::set<std::string> labelSet; std::map<std::string, int> topLabelSet; // Word count statistics for producing unknown word labels. std::map<std::string, int> wordCount; std::map<std::string, std::string> wordLabel; std::string targetLine; std::string sourceLine; std::string alignmentLine; XmlTreeParser xmlTreeParser(labelSet, topLabelSet); ScfgRuleWriter writer(fwdExtractStream, invExtractStream, options); size_t lineNum = options.sentenceOffset; while (true) { std::getline(targetStream, targetLine); std::getline(sourceStream, sourceLine); std::getline(alignmentStream, alignmentLine); if (targetStream.eof() && sourceStream.eof() && alignmentStream.eof()) { break; } if (targetStream.eof() || sourceStream.eof() || alignmentStream.eof()) { Error("Files must contain same number of lines"); } ++lineNum; // Parse target tree. 
if (targetLine.size() == 0) { std::cerr << "skipping line " << lineNum << " with empty target tree\n"; continue; } std::auto_ptr<ParseTree> t; try { t = xmlTreeParser.Parse(targetLine); assert(t.get()); } catch (const Exception &e) { std::ostringstream s; s << "Failed to parse XML tree at line " << lineNum; if (!e.GetMsg().empty()) { s << ": " << e.GetMsg(); } Error(s.str()); } // Read source tokens. std::vector<std::string> sourceTokens(ReadTokens(sourceLine)); // Read word alignments. Alignment alignment; try { alignment = ReadAlignment(alignmentLine); } catch (const Exception &e) { std::ostringstream s; s << "Failed to read alignment at line " << lineNum << ": "; s << e.GetMsg(); Error(s.str()); } if (alignment.size() == 0) { std::cerr << "skipping line " << lineNum << " without alignment points\n"; continue; } // Record word counts. if (!options.unknownWordFile.empty()) { CollectWordLabelCounts(*t, options, wordCount, wordLabel); } // Form an alignment graph from the target tree, source words, and // alignment. AlignmentGraph graph(t.get(), sourceTokens, alignment); // Extract minimal rules, adding each rule to its root node's rule set. graph.ExtractMinimalRules(options); // Extract composed rules. if (!options.minimal) { graph.ExtractComposedRules(options); } // Write the rules, subject to scope pruning. const std::vector<Node *> &targetNodes = graph.GetTargetNodes(); for (std::vector<Node *>::const_iterator p = targetNodes.begin(); p != targetNodes.end(); ++p) { const std::vector<const Subgraph *> &rules = (*p)->GetRules(); for (std::vector<const Subgraph *>::const_iterator q = rules.begin(); q != rules.end(); ++q) { ScfgRule r(**q); // TODO Can scope pruning be done earlier? 
if (r.Scope() <= options.maxScope) { writer.Write(r); } } } } if (!options.glueGrammarFile.empty()) { WriteGlueGrammar(labelSet, topLabelSet, glueGrammarStream); } if (!options.unknownWordFile.empty()) { WriteUnknownWordLabel(wordCount, wordLabel, options, unknownWordStream); } return 0; }
/* Function: include_alignment()
 * Date:     SRE, Sun Jul  5 15:25:13 1998 [St. Louis]
 *
 * Purpose:  Read a multiple alignment file, align that alignment to
 *           the HMM, and append the resulting per-sequence traces,
 *           raw sequences, digitized sequences and SQINFO records to
 *           the caller's existing arrays.
 *
 *           When do_mapped is TRUE the master trace is built from the
 *           HMM's alignment map (after checking that the alignment's
 *           checksum matches hmm->checksum); otherwise it comes from
 *           P7ViterbiAlignAlignment().
 *
 * Args:     seqfile  - name of alignment file
 *           hmm      - model to align to
 *           do_mapped- TRUE if we're to use the HMM's alignment map
 *           rsq      - RETURN: array of rseqs to add to
 *           dsq      - RETURN: array of dsq to add to
 *           sqinfo   - RETURN: array of SQINFO to add to
 *           tr       - RETURN: array of traces to add to
 *           nseq     - RETURN: number of seqs
 *
 * Returns:  void. rsq, dsq, sqinfo and tr point at new, realloc'ed
 *           arrays; nseq is increased to nseq+ainfo.nseq.
 *           Any failure to open, identify or read the file, or a
 *           checksum mismatch, is reported through ajFatal().
 */
void
include_alignment(char *seqfile, struct plan7_s *hmm, int do_mapped,
                  char ***rsq, char ***dsq, SQINFO **sqinfo,
                  struct p7trace_s ***tr, int *nseq)
{
  int    format;                /* format of alignment file             */
  char **aseq;                  /* aligned seqs                         */
  char **newdsq;                /* digitized versions of the new seqs   */
  char **newrseq;               /* dealigned versions of the new seqs   */
  AINFO  ainfo;                 /* info that goes with aseq             */
  int    i;                     /* counter over the new seqs            */
  int    first;                 /* slot of the first new seq in arrays  */
  struct p7trace_s  *master;    /* master trace                         */
  struct p7trace_s **addtr;     /* individual traces for aseq           */

  /* Work out the file format. A missing file gets its own message and
   * then (should ajFatal ever return) the generic one as well, exactly
   * as a fall-through switch on squid_errno would do.
   */
  if (! SeqfileFormat(seqfile, &format, NULL))
    {
      if (squid_errno == SQERR_NOFILE)
        ajFatal("Alignment file %s could not be opened for reading", seqfile);
      ajFatal("Failed to determine format of alignment file %s", seqfile);
    }

  /* Read the alignment and upper-case every aligned sequence. */
  if (! ReadAlignment(seqfile, format, &aseq, &ainfo))
    ajFatal("Failed to read aligned sequence file %s", seqfile);
  for (i = 0; i < ainfo.nseq; i++)
    s2upper(aseq[i]);

  /* The map is only meaningful for the alignment it was made from. */
  if (do_mapped && GCGMultchecksum(aseq, ainfo.nseq) != hmm->checksum)
    ajFatal("The checksums for alignment file %s and the HMM alignment map don't match.",
            seqfile);

  /* Master trace, then one trace per sequence. */
  master = do_mapped ? MasterTraceFromMap(hmm->map, hmm->M, ainfo.alen)
                     : P7ViterbiAlignAlignment(aseq, &ainfo, hmm);
  ImposeMasterTrace(aseq, ainfo.nseq, master, &addtr);

  /* Append the new traces to the existing ones. */
  *tr = MergeTraceArrays(*tr, *nseq, addtr, ainfo.nseq);

  /* *nseq is not updated until the very end, so the new entries start
   * at this index in every array.
   */
  first = *nseq;

  /* Raw sequences: grow the array, then adopt the dealigned strings.
   * Only the temporary pointer array is freed, not the strings.
   */
  *rsq = (char **) ReallocOrDie((*rsq), sizeof(char *) * (first + ainfo.nseq));
  DealignAseqs(aseq, ainfo.nseq, &newrseq);
  for (i = 0; i < ainfo.nseq; i++)
    (*rsq)[first + i] = newrseq[i];
  free(newrseq);

  /* Digitized sequences: same pattern. */
  *dsq = (char **) ReallocOrDie((*dsq), sizeof(char *) * (first + ainfo.nseq));
  DigitizeAlignment(aseq, &ainfo, &newdsq);
  for (i = 0; i < ainfo.nseq; i++)
    (*dsq)[first + i] = newdsq[i];
  free(newdsq);

  /* SQINFO records are copied, since the alignment is freed below. */
  *sqinfo = (SQINFO *) ReallocOrDie((*sqinfo), sizeof(SQINFO) * (first + ainfo.nseq));
  for (i = 0; i < ainfo.nseq; i++)
    SeqinfoCopy(&((*sqinfo)[first + i]), &(ainfo.sqinfo[i]));

  *nseq = first + ainfo.nseq;

  /* Cleanup */
  P7FreeTrace(master);
  FreeAlignment(aseq, &ainfo);
  return;
}
/* Function: SeqfileOpen() * * Purpose : Open a sequence database file and prepare for reading * sequentially. * * Args: filename - name of file to open * format - format of file * env - environment variable for path (e.g. BLASTDB) * * Returns opened SQFILE ptr, or NULL on failure. */ SQFILE * SeqfileOpen(char *filename, int format, char *env) { SQFILE *dbfp; dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE)); dbfp->format = format; dbfp->longline = FALSE; /* Open our file handle. * Three possibilities: * 1. normal file open * 2. filename = "-"; read from stdin * 3. filename = "*.gz"; read thru pipe from gzip * If we're reading from stdin or a pipe, we can't reliably * back up, so we can't do two-pass parsers like the interleaved alignment * formats. */ if (strcmp(filename, "-") == 0) { if (IsInterleavedFormat(format)) Die("Can't read interleaved alignment formats thru stdin, sorry"); dbfp->f = stdin; dbfp->do_stdin = TRUE; dbfp->do_gzip = FALSE; } else if (Strparse("^.*\\.gz$", filename, 0) == 0) { char cmd[256]; if (IsInterleavedFormat(format)) Die("Can't read interleaved alignment formats thru gunzip, sorry"); if (strlen(filename) + strlen("gzip -dc ") >= 256) { squid_errno = SQERR_PARAMETER; return NULL; } sprintf(cmd, "gzip -dc %s", filename); if ((dbfp->f = popen(cmd, "r")) == NULL) { squid_errno = SQERR_NOFILE; return NULL; } /* file (or gzip!) doesn't exist */ dbfp->do_stdin = FALSE; dbfp->do_gzip = TRUE; } else { if ((dbfp->f = fopen(filename, "r")) == NULL && (dbfp->f = EnvFileOpen(filename, env)) == NULL) { squid_errno = SQERR_NOFILE; return NULL; } dbfp->do_stdin = FALSE; dbfp->do_gzip = FALSE; } /* The hack for sequential access of an interleaved alignment file: * read the alignment in, we'll copy sequences out one at a time. */ dbfp->ali_aseqs = NULL; if (IsInterleavedFormat(format)) { if (! ReadAlignment(filename, format, &(dbfp->ali_aseqs), &(dbfp->ali_ainfo))) return NULL; dbfp->ali_curridx = 0; return dbfp; } /* Load the first line. 
*/ getline2(dbfp); return dbfp; }