/*
 * Constructs a dynamic-programming table from a FASTA input file and a
 * scoring-parameter file.
 *
 * inputFile  - passed to readFastaFile(); presumably this is what fills in
 *              s1/s2 (and their names) -- confirm against readFastaFile().
 * paramsFile - passed to readParamsFile(). If that call throws a C-string
 *              message, the message is printed to stderr and the scoring
 *              parameters fall back to scoreParams.setToDefault().
 *
 * After loading, the table is sized to (s1.length() + 1) rows by
 * (s2.length() + 1) columns.
 */
DPTable::DPTable(const string & inputFile, const string & paramsFile)
    : s1(""), s1Name(""),
      s2(""), s2Name(""),
      s1Out(""), cnxnStr(""), s2Out(""),
      table(0),
      scoreParams(),
      globalOpt(0), localOpt(0),
      numMatches(0), numMismatches(0), numOpenGaps(0), numGaps(0)
{
    readFastaFile(inputFile);

    try {
        readParamsFile(paramsFile);
    } catch (const char* m) {
        // A bad or missing parameter file is not fatal: report it and
        // continue with the default scoring parameters.
        cerr << m << endl;
        scoreParams.setToDefault();
    }

    // One extra row and column beyond the sequence lengths.
    const size_t numRows = s1.length() + 1;
    const size_t numCols = s2.length() + 1;

    table.resize(numRows);
    // Unsigned index: avoids the signed/unsigned comparison against the
    // container size that an int counter would cause.
    for (size_t row = 0; row < numRows; row++) {
        table[row].resize(numCols);
    }
}
//Builds the De Bruijn graph from a Trinity.fasta file.
//
//Each FASTA header describes a path through numbered nodes, and each path
//part gives the range of the record's sequence that belongs to that node.
//Nodes are created the first time they are seen, and an edge is recorded
//between each pair of consecutive nodes in a path.
//
//Throws the C-string "load error" if a header cannot be parsed or if no
//nodes were loaded at all.
void AssemblyGraph::buildDeBruijnGraphFromTrinityFasta(QString fullFileName)
{
    m_graphFileType = TRINITY;

    std::vector<QString> names;
    std::vector<QString> sequences;
    readFastaFile(fullFileName, &names, &sequences);

    std::vector<QString> edgeStartingNodeNames;
    std::vector<QString> edgeEndingNodeNames;

    for (size_t i = 0; i < names.size(); ++i)
    {
        QString name = names[i];
        QString sequence = sequences[i];

        //The header can come in a couple of different formats:
        // TR1|c0_g1_i1 len=280 path=[274:0-228 275:229-279] [-1, 274, 275, -2]
        // GG1|c0_g1_i1 len=302 path=[1:0-301]
        // comp0_c0_seq1 len=286 path=[6:0-285]
        // c0_g1_i1 len=363 path=[119:0-185 43:186-244 43:245-303 43:304-362]

        //The node names will begin with a string that contains everything
        //up to the component number (e.g. "c0"), in the same format as it is
        //in the Trinity.fasta file.
        if (name.length() < 4)
            throw "load error";

        int componentStartIndex = name.indexOf(QRegExp("c\\d+_"));
        if (componentStartIndex < 0)
            throw "load error";
        int componentEndIndex = name.indexOf("_", componentStartIndex);
        if (componentEndIndex < 0)
            throw "load error";

        QString component = name.left(componentEndIndex);
        if (component.length() < 2)
            throw "load error";

        //Check for the "path=[" marker BEFORE adding its length. Adding the
        //offset first would turn a "not found" result (-1) into a positive
        //index and the missing marker would go undetected.
        const QString pathMarker = "path=[";
        int pathMarkerIndex = name.indexOf(pathMarker);
        if (pathMarkerIndex < 0)
            throw "load error";
        int pathStartIndex = pathMarkerIndex + pathMarker.length();
        int pathEndIndex = name.indexOf("]", pathStartIndex);
        if (pathEndIndex < 0)
            throw "load error";

        int pathLength = pathEndIndex - pathStartIndex;
        QString path = name.mid(pathStartIndex, pathLength);
        if (path.size() == 0)
            throw "load error";

        QStringList pathParts = path.split(" ");

        //Each path part is a node
        QString previousNodeName;
        for (int partIndex = 0; partIndex < pathParts.length(); ++partIndex)
        {
            QString pathPart = pathParts.at(partIndex);
            QStringList nodeParts = pathPart.split(":");
            if (nodeParts.size() < 2)
                throw "load error";

            //Most node numbers will be formatted simply as the number, but some
            //(I don't know why) have '@' and the start and '@!' at the end.  In
            //these cases, we must strip those extra characters off.
            QString nodeNumberString = nodeParts.at(0);
            if (nodeNumberString.startsWith(QChar('@')))
                nodeNumberString = nodeNumberString.mid(1, nodeNumberString.length() - 3);

            //A part such as ":0-5" has no node number at all. Reject it
            //rather than indexing into an empty string or creating a node
            //with a nameless number.
            if (nodeNumberString.isEmpty())
                throw "load error";

            QString nodeName = component + "_" + nodeNumberString + "+";

            //If the node doesn't yet exist, make it now.
            if (!m_deBruijnGraphNodes.contains(nodeName))
            {
                QString nodeRange = nodeParts.at(1);
                QStringList nodeRangeParts = nodeRange.split("-");
                if (nodeRangeParts.size() < 2)
                    throw "load error";

                //The range is inclusive at both ends, hence the + 1.
                int nodeRangeStart = nodeRangeParts.at(0).toInt();
                int nodeRangeEnd = nodeRangeParts.at(1).toInt();
                int nodeLength = nodeRangeEnd - nodeRangeStart + 1;

                QByteArray nodeSequence = sequence.mid(nodeRangeStart, nodeLength).toLocal8Bit();
                DeBruijnNode * node = new DeBruijnNode(nodeName, 0.0, nodeSequence);
                m_deBruijnGraphNodes.insert(nodeName, node);
            }

            //Remember to make an edge for the previous node to this one.
            if (partIndex > 0)
            {
                edgeStartingNodeNames.push_back(previousNodeName);
                edgeEndingNodeNames.push_back(nodeName);
            }
            previousNodeName = nodeName;
        }
    }

    //Even though the Trinity.fasta file only contains positive nodes, Bandage
    //expects negative reverse complements nodes, so make them now.
    QMapIterator<QString, DeBruijnNode*> nodeIt(m_deBruijnGraphNodes);
    while (nodeIt.hasNext())
    {
        nodeIt.next();
        DeBruijnNode * node = nodeIt.value();
        makeReverseComplementNodeIfNecessary(node);
    }
    pointEachNodeToItsReverseComplement();

    //Create all of the edges.  The createDeBruijnEdge function checks for
    //duplicates, so it's okay if we try to add the same edge multiple times.
    for (size_t edgeIndex = 0; edgeIndex < edgeStartingNodeNames.size(); ++edgeIndex)
    {
        QString node1Name = edgeStartingNodeNames[edgeIndex];
        QString node2Name = edgeEndingNodeNames[edgeIndex];
        createDeBruijnEdge(node1Name, node2Name);
    }

    setAllEdgesExactOverlap(0);

    //A file that produced no nodes at all is treated as a failed load.
    if (m_deBruijnGraphNodes.size() == 0)
        throw "load error";
}
int main(int argc, char *argv[]) { // Parse arguments if (argc != 3) { usage(argv); return 1; } // You would load a custom HMM here if you wanted using // hmm_getStateMachine (see the realign code) StateMachine *stateMachine = stateMachine5_construct(fiveState); PairwiseAlignmentParameters *parameters = pairwiseAlignmentBandingParameters_construct(); stHash *targetSequences = readFastaFile(argv[1]); stHash *querySequences = readFastaFile(argv[2]); // For each query sequence, align it against all target sequences. stHashIterator *queryIt = stHash_getIterator(querySequences); char *queryHeader; while ((queryHeader = stHash_getNext(queryIt)) != NULL) { char *querySeq = stHash_search(querySequences, queryHeader); stHashIterator *targetIt = stHash_getIterator(targetSequences); char *targetHeader; while ((targetHeader = stHash_getNext(targetIt)) != NULL) { char *targetSeq = stHash_search(targetSequences, targetHeader); // Here we should try both the target sequence and its // reverse-complemented version // Aligns the sequences. // If you have alignment constraints (anchors) you should // replace this with getAlignedPairsUsingAnchors. stList *alignedPairs = getAlignedPairs(stateMachine, targetSeq, querySeq, parameters, true, true); // Takes into account the probability of aligning to a // gap, by transforming the posterior probability into the // AMAP objective function (see Schwartz & Pachter, 2007). alignedPairs = reweightAlignedPairs2(alignedPairs, strlen(targetSeq), strlen(querySeq), parameters->gapGamma); // I think this calculates the optimal ordered set of // alignments from the unordered set of aligned pairs, not // completely sure. alignedPairs = filterPairwiseAlignmentToMakePairsOrdered(alignedPairs, targetSeq, querySeq, // This parameter says that the minimum posterior probability we will accept has to be at least 0.9. 0.9); // After this the "aligned pairs" data structure changes, // which is a little sketchy. 
It's just so that the // alignment can be printed properly. stList_mapReplace(alignedPairs, convertToAnchorPair, NULL); stList_sort(alignedPairs, (int (*)(const void *, const void *)) stIntTuple_cmpFn); struct PairwiseAlignment *alignment = convertAlignedPairsToPairwiseAlignment(targetHeader, queryHeader, 0, strlen(targetSeq), strlen(querySeq), alignedPairs); // Output the cigar string cigarWrite(stdout, alignment, 0); stList_destruct(alignedPairs); destructPairwiseAlignment(alignment); } stHash_destructIterator(targetIt); } stHash_destructIterator(queryIt); // Clean up stHash_destruct(targetSequences); stHash_destruct(querySequences); pairwiseAlignmentBandingParameters_destruct(parameters); stateMachine_destruct(stateMachine); }
/*
 * Constructs a ProteinSequence from a FASTA file.
 *
 * sequenceFileName - path handed to readFastaFile().
 * priorStream      - stream used to construct muPrior.
 * _logFile         - output stream bound to the logFile member.
 *
 * taoPrior and nuPrior are both constructed from C2 together with their
 * respective hyper-parameters (taoHyperParam, nuHyperParam).
 */
ProteinSequence::ProteinSequence(string sequenceFileName,
                                 stringstream& priorStream,
                                 ostream& _logFile)
    : logFile(_logFile),
      muPrior(priorStream),
      taoPrior(C2, taoHyperParam),
      nuPrior(C2, nuHyperParam)
{
    readFastaFile(sequenceFileName);

    // Disabled debugging aid, kept for reference:
    //muPrior.show(logFile);
}