int main() {
	int runs;
	scanf("%d", &runs);
	for (int run = 1; run <= runs; ++run) {
		scanf("%s", str);
		int ls = strlen(str);
		for (int i = 0; i < ls; ++i) num[i] = str[i];
		num[ls] = 1;
		for (int i = 0; i < ls; ++i) num[ls + 1 + i] = str[ls - i - 1];
		num[ls+ls+1] = 0;
		sa.init(num, ls + ls + 1, 128);
		sa.init_rmq();
		memset(vis, 0, sizeof(vis));
		int ans = 0, cnt = 0;
		for (int i = 1; i <= ls+ls; ++i) {
			cnt = min(cnt, sa.height[i]);
			if (!vis[sa[i]] && vis[ls+ls+1-sa[i]+1]) {
				int t = sa.lcp(sa[i], ls+ls+1-sa[i]+1);
				if (t <= cnt) continue;
				ans += t - cnt;
				if (cnt < t) ans = t;
			} else vis[sa[i]] = 1;
		}
		printf("%d\n", ans);
	}
	return 0;
}
Example #2
int main() {
    SuffixArray in;
    while(gets(in.str) && in.str[0] != '\0') {
    	int n = 0;
    	for(int i = 0; in.str[i]; i++)
    		if(in.str[i] != ' ')
    			in.str[n++] = in.str[i];
    	in.str[n] = '\0';
    	
        in.build();
        in.build_h();
        
        if(n == 0)
        	puts("0");
        for(int i = 1; i <= in.n; i++) {
        	// A run of `cnt` consecutive height values >= i means that cnt+1
        	// consecutive suffixes in sorted order share a prefix of length >= i.
        	int cnt = 0, ret = 0;
        	for(int j = 0; j < in.n; j++) {
        		if(in.h[j] >= i)
        			cnt++;
        		else
        			ret = max(ret, cnt), cnt = 0;
        	}
        	ret = max(ret, cnt);
        	if(ret <= 0)
        		break;
        	printf("%d\n", ret + 1);
        }
        puts("");
    }
    return 0;
}
Example #3
// Validate the sampled suffix array values are correct
void SampledSuffixArray::validate(const std::string filename, const BWT* pBWT)
{
    ReadTable* pRT = new ReadTable(filename);
    SuffixArray* pSA = new SuffixArray(pRT, 1);
    
    std::cout << "Validating sampled suffix array entries\n";

    for(size_t i = 0; i < pSA->getSize(); ++i)
    {
        SAElem calc = calcSA(i, pBWT);
        SAElem real = pSA->get(i);
        if(calc.getID() != real.getID() || calc.getPos() != real.getPos())
        {
            std::cout << "Error SA elements do not match for " << i << "\n";
            std::cout << "Calc: " << calc << "\n";
            std::cout << "Real: " << real << "\n";
            exit(1);
        }
    }
    
    std::cout << "All calculate SA values are correct\n";

    delete pRT;
    delete pSA;
}
Example #4
int main()
{
    scanf("%s", buf);
    SuffixArray sa;
    sa.create(buf);
    sa.output();
    
    return 0;
} 
Example #5
int main() {
    int n, m, l, r;
    set<pair<int, int> > st;

    scanf("%d%d", &n, &m);
    scanf("%s", buf);
    sa.init(n, buf);
    rmq.init(n, sa.height);

    l = 0;
    r = 1;
    for (int i = 0; i < m; ++i) {
        scanf("%s", op);
        int& t = op[0] == 'L' ? l : r;
        t += op[1] == '+' ? 1 : -1;
        int k = sa.rank[l];
        int lo = 0, hi = k;
        while (lo < hi) {
            int mi = (lo + hi) / 2;
            if (rmq.value(mi + 1, k + 1) >= r - l) {
                hi = mi;
            } else {
                lo = mi + 1;
            }
        }
        st.insert(make_pair(hi, r - l));
    }
    printf("%d\n", (int)st.size());

    return 0;
}
Example #6
int main() {
    int i, first = 0;
    while(scanf("%d", &n) == 1 && n) {
        if(first)   puts("");
        first = 1;
        int m = 0;
        half = n/2; // > half
        int mxlen = 0;
        for(i = 0; i < n; i++) {
            scanf("%s", SA.str+m);
            int cnt = 0;
            while(SA.str[m])    Sfrom[m] = i, m++, cnt++;
            if(cnt > mxlen) mxlen = cnt;
            Wlen[i] = cnt;
            SA.str[m++] = '$';
            SA.str[m] = 0;
        }
        SA.str[m-1] = '\0';
        if(n == 1) {
            puts(SA.str);
            continue;
        }
        //puts(SA.str);
        SA.build();
        SA.build_h();
        int l = 1, r = mxlen;
        int res = 0;
        while(l <= r) {
            m = (l+r)/2;
            if(check(m, 0)) {
                l = m+1;
                if(m > res)
                    res = m;
            } else {
                r = m-1;
            }
        }
        if(res == 0)
            puts("?");
        else
            check(res, 1);
    }
    return 0;
}
Example #7
void test1(wordstring& ws, intstring& ids)
{
   struct timeval start;
   struct timeval end;
   SuffixArray sa;
   gettimeofday(&start,NULL);
   sa.DA(ids); 
   gettimeofday(&end,NULL);
   double dur = 0;
   dur += (end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec);
   cout<<dur/1000000<<endl;
   cerr << endl;
   vector<RepeatSubString> repeat;
   CaculateRepeatSubString(sa, repeat, 5); 
   for (int i = 0; i < repeat.size(); ++i)
   {
       cout << repeat[i] << repeat[i].ToString(ws) << endl;
   }
}
Example #8
void buildIndexForTable(std::string prefix, const ReadTable* pRT, bool isReverse)
{
    // Create suffix array from read table
    SuffixArray* pSA = new SuffixArray(pRT, opt::numThreads);

    if(opt::validate)
    {
        std::cout << "Validating suffix array\n";
        pSA->validate(pRT);
    }

    std::string bwt_filename = prefix + (!isReverse ? BWT_EXT : RBWT_EXT);
    pSA->writeBWT(bwt_filename, pRT);

    std::string sufidx_filename = prefix + (!isReverse ? SAI_EXT : RSAI_EXT);
    pSA->writeIndex(sufidx_filename);

    delete pSA;
    pSA = NULL;
}
Example #9
void
print_ranks(std::string const &s, SuffixArray &sa) {
    int n = sa.dp.empty() ? 0 : sa.dp[0].size();
    for (int r = 0; r < n; ++r) {
        printf("%3c", s[r]);
        for (size_t c = 0; c < sa.dp.size(); ++c) {
            printf("%4d", sa.get_dp(c)[r]);
        }
        printf("\n");
    }
}
Example #10
 static Int find_max_length( const SuffixArray& sa, const String& s ) {
   // Two-pointer scan: `len` is the best match length found so far and never
   // decreases. If the (len+1)-character substring starting at i occurs in the
   // indexed text, the window grows; otherwise no longer match can start at i,
   // so i advances. i + len grows by one per step, so at most |s| iterations.
   Int len = 0;
   int n = s.size();
   for ( int i = 0; i + len < n; ) {
     if ( sa.find(s.substr(i, len + 1)) ) {
       len ++;
     } else {
       i ++;
     }
   }
   return len;
 }
Example #11
void
print_suffix_array(std::string const &s, SuffixArray &sa, vi_t *plcp = NULL) {
    vi_t pos;
    sa.sorted_indexes(pos);
    for (size_t i = 0; i < pos.size(); ++i) {
        // Limit each line to 60 characters
        if (plcp) {
            printf("%3d: [%2d]: %s\n", pos[i], (*plcp)[pos[i]], s.substr(pos[i], 60).c_str());
        }
        else {
            printf("%3d: %s\n", pos[i], s.substr(pos[i], 60).c_str());
        }
    }
}
Example #12
template <typename T>
std::vector<int> longest_common_prefix(const T &s, const SuffixArray &sa){
	const int n = sa.size();
	std::vector<int> vs(n), isa(n), lcp(n - 1);
	for(int i = 0; i + 1 < n; ++i){ vs[i] = s[i]; }
	for(int i = 0; i < n; ++i){ isa[sa[i]] = i; }
	int h = 0;
	for(int i = 0; i < n; ++i){
		const int j = isa[i];
		if(j > 0){
			const int k = j - 1;
			while(vs[sa[j] + h] == vs[sa[k] + h]){ ++h; }
			lcp[k] = h;
			if(h > 0){ --h; }
		}
	}
	return lcp;
}
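The longest_common_prefix routine above is Kasai's linear-time LCP construction. It assumes the last text position holds a unique smallest sentinel (vs[n-1] stays 0), so the sentinel suffix has rank 0 and the character comparisons never run off the end. Below is a self-contained restatement for illustration only, using a plain std::vector<int> in place of the SuffixArray wrapper; the kasai_lcp name and the explicit bounds checks are assumptions, not part of the original code.
#include <cassert>
#include <string>
#include <vector>

// Kasai's algorithm on a plain suffix array of s (no sentinel here, so bounds
// are checked explicitly). lcp[k] is the length of the longest common prefix
// of the suffixes ranked k and k+1.
std::vector<int> kasai_lcp(const std::string& s, const std::vector<int>& sa) {
    const int n = (int)sa.size();
    std::vector<int> rank(n), lcp(n > 1 ? n - 1 : 0);
    for (int i = 0; i < n; ++i) rank[sa[i]] = i;
    int h = 0;
    for (int i = 0; i < n; ++i) {
        if (rank[i] == 0) { h = 0; continue; }   // no predecessor in the suffix array
        int j = sa[rank[i] - 1];                 // suffix ranked just before suffix i
        while (i + h < n && j + h < n && s[i + h] == s[j + h]) ++h;
        lcp[rank[i] - 1] = h;
        if (h > 0) --h;                          // the LCP drops by at most 1 per text position
    }
    return lcp;
}

int main() {
    // Suffix array of "banana": a, ana, anana, banana, na, nana
    std::vector<int> sa = {5, 3, 1, 0, 4, 2};
    std::vector<int> lcp = kasai_lcp("banana", sa);
    std::vector<int> expected = {1, 3, 0, 0, 2};
    assert(lcp == expected);
    return 0;
}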
Example #13
/** Add the overlaps of vseq to the graph. */
static void addOverlapsSA(Graph& g, const SuffixArray& sa,
		ContigNode v, const string& vseq)
{
	assert(!vseq.empty());
	set<ContigNode> seen;
	typedef SuffixArray::const_iterator It;
	for (string q(vseq, 0, vseq.size() - 1);
			q.size() >= opt::minOverlap; chop(q)) {
		pair<It, It> range = sa.equal_range(q);
		for (It it = range.first; it != range.second; ++it) {
			ContigNode u(it->second);
			if (seen.insert(u).second) {
				// Add the longest overlap between two vertices.
				unsigned overlap = it->first.size();
				add_edge(u, v, -overlap, static_cast<DG&>(g));
			}
		}
	}
}
Example #14
void RunTest(SuffixArray &index, const context_t *context,
             const unordered_map<vector<wid_t>, size_t, phrase_hash> &ngrams, vector<speed_perf_t> &speedData) {
    size_t queryCount = 0;

    for (auto entry = ngrams.begin(); entry != ngrams.end(); ++entry) {
        Collector *collector = index.NewCollector(context, true);

        for (size_t i = 0; i < entry->first.size(); ++i) {
            double begin = GetTime();
            vector<sample_t> samples;
            collector->Extend(entry->first[i], 1000, samples);
            speedData[i].seconds += GetElapsedTime(begin);
            speedData[i].requests++;

            queryCount++;

            if (queryCount % 10000 == 0)
                cout << "." << flush;
        }

        delete collector;
    }
}
Example #15
void ILCPConstruct(const SuffixArray& sa,
                   std::vector<SuffixArray::Index>* ilcp) {
  typedef SuffixArray::Index Index;
  std::vector<Index>& text_lcp = *ilcp;
  text_lcp.resize(sa.size());
  Index start = 0;
  int num_docs = 0;
  const char* text = sa.text();
  for (Index i = 0; i <= (Index)sa.size(); ++i) {
    if (i == (Index)sa.size() || (unsigned char)text[i] <= 1) {
      const char* doc = text + start;
      Index doc_len = i - start;
      SuffixArray doc_sa(doc, doc_len);
      for (Index j = 0; j < doc_len; ++j) {
        Index p = doc_sa.sa(j);
        Index lcp = doc_sa.lcp(j);
        text_lcp[start + p] = lcp;
      }
      num_docs++;
      start = i;
    }
  }
  std::vector<bool> visited(sa.size());
  // permute text_lcp in place so that text_lcp[i] = text_lcp[sa[i]]
  // (see the standalone cycle-following sketch after this function)
  for (Index i = 0; i < (Index)sa.size(); ++i) {
    if (!visited[i]) {
      int j = i;
      while (true) {
        visited[j] = 1;
        Index to = sa.sa(j);
        if (visited[to]) break;
        std::swap(text_lcp[j], text_lcp[to]);
        j = to;
      }
    }
    // ilcp[i] = text_lcp[sa.sa(i)];
  }
}
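The final loop above applies the permutation given by sa to text_lcp in place by following cycles, using one visited bit per element. A minimal standalone sketch of that same pattern (the apply_permutation name and the example values are made up for illustration):
#include <cassert>
#include <utility>
#include <vector>

// Cycle-following, in-place permutation: after the call, a[i] holds the old a[p[i]].
void apply_permutation(std::vector<int>& a, const std::vector<int>& p) {
    std::vector<bool> visited(a.size(), false);
    for (size_t i = 0; i < a.size(); ++i) {
        if (visited[i]) continue;
        size_t j = i;
        while (true) {
            visited[j] = true;
            size_t to = p[j];            // same role as sa.sa(j) above
            if (visited[to]) break;
            std::swap(a[j], a[to]);
            j = to;
        }
    }
}

int main() {
    std::vector<int> a = {10, 20, 30};
    std::vector<int> p = {1, 2, 0};      // want {a[1], a[2], a[0]}
    apply_permutation(a, p);
    assert(a[0] == 20 && a[1] == 30 && a[2] == 10);
    return 0;
}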
Example #16
std::string parseDupHits(const StringVector& hitsFilenames, const std::string& out_prefix)
{
    // Load the suffix array index and the reverse suffix array index
    // Note these are not the full suffix arrays
    SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT);
    SuffixArray* pRevSAI = new SuffixArray(opt::prefix + RSAI_EXT);

    // Load the read table to look up the lengths of the reads and their ids.
    // When rmduping a set of reads, the ReadInfoTable can actually be larger than the
    // BWT if the names of the reads are very long. Previously, when two reads
    // are duplicated, the read with the lexicographically lower read name was chosen
    // to be kept. To save memory here, we break ties using the index in the ReadInfoTable
    // instead. This allows us to avoid loading the read names.
    ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings(), RIO_NUMERICID);

    std::string outFile = out_prefix + ".fa";
    std::string dupFile = out_prefix + ".dups.fa";
    std::ostream* pWriter = createWriter(outFile);
    std::ostream* pDupWriter = createWriter(dupFile);

    size_t substringRemoved = 0;
    size_t identicalRemoved = 0;
    size_t kept = 0;
    size_t buffer_size = SequenceProcessFramework::BUFFER_SIZE;

    // The reads must be output in their original ordering.
    // The hits are stored in blocks of buffer_size items: we read
    // buffer_size items from the first hits file, then buffer_size
    // from the second, and so on until all the hits have been processed
    // (a minimal sketch of this round-robin pattern follows this function).
    size_t num_files = hitsFilenames.size();
    std::vector<std::istream*> reader_vec(num_files, 0);

    for(size_t i = 0; i < num_files; ++i)
    {
        std::cout << "Opening " << hitsFilenames[i] << "\n";
        reader_vec[i] = createReader(hitsFilenames[i]);
    }

    bool done = false;
    size_t currReaderIdx = 0;
    size_t numRead = 0;
    size_t numReadersDone = 0;
    std::string line;

    while(!done)
    {
        // Parse a line from the current file
        bool valid = getline(*reader_vec[currReaderIdx], line);
        ++numRead;
        // Deal with switching the active reader and the end of files
        if(!valid || numRead == buffer_size)
        {
            // Switch the reader
            currReaderIdx = (currReaderIdx + 1) % num_files;
            numRead = 0;

            // Break once all the readers are invalid
            if(!valid)
            {
                ++numReadersDone;
                if(numReadersDone == num_files)
                {
                    done = true;
                    break;
                }
            }
        }

        // Parse the data
        if(valid)
        {
            std::string id;
            std::string sequence;
            std::string hitsStr;
            size_t readIdx;
            size_t numCopies;
            bool isSubstring;

            std::stringstream parser(line);
            parser >> id;
            parser >> sequence;
            getline(parser, hitsStr);

            OverlapVector ov;
            OverlapCommon::parseHitsString(hitsStr, pRIT, pRIT, pFwdSAI, pRevSAI, true, readIdx, numCopies, ov, isSubstring);
            
            bool isContained = false;
            if(isSubstring)
            {
                ++substringRemoved;
                isContained = true;
            }
            else
            {
                for(OverlapVector::iterator iter = ov.begin(); iter != ov.end(); ++iter)
                {
                    if(iter->isContainment() && iter->getContainedIdx() == 0)
                    {
                        // This read is contained by some other read
                        ++identicalRemoved;
                        isContained = true;
                        break;
                    }
                }
            }

            SeqItem item = {id, sequence};
            std::stringstream meta;
            meta << id << " NumDuplicates=" << numCopies;

            if(isContained)
            {
                // The read's index in the sequence data base
                // is needed when removing it from the FM-index.
                // In the output fasta, we set the reads ID to be the index
                // and record its old id in the fasta header.
                std::stringstream newID;
                newID << item.id << ",seqrank=" << readIdx;
                item.id = newID.str();

                // Write some metadata with the fasta record
                item.write(*pDupWriter, meta.str());
            }
            else
            {
                ++kept;
                // Write the read
                item.write(*pWriter, meta.str());
            }
        }
    }

    for(size_t i = 0; i < num_files; ++i)
    {
        delete reader_vec[i];
        unlink(hitsFilenames[i].c_str());
    }

    
    printf("[%s] Removed %zu substring reads\n", PROGRAM_IDENT, substringRemoved);
    printf("[%s] Removed %zu identical reads\n", PROGRAM_IDENT, identicalRemoved);
    printf("[%s] Kept %zu reads\n", PROGRAM_IDENT, kept);

    // Delete allocated data
    delete pFwdSAI;
    delete pRevSAI;
    delete pRIT;
    delete pWriter;
    delete pDupWriter;

    return dupFile;
}
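The reading loop above drains the hits files round-robin in blocks of buffer_size lines, as described in the comment near the top of the function. A minimal standalone sketch of just that switching pattern, with std::istringstream objects standing in for the real hit files (the stream contents and block size are made up for illustration):
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
    std::istringstream r0("a1\na2\na3\n"), r1("b1\n"), r2("c1\nc2\n");
    std::vector<std::istream*> readers = { &r0, &r1, &r2 };

    const size_t block = 2;              // lines to read before switching readers
    size_t idx = 0, numRead = 0, numDone = 0;
    std::vector<bool> done(readers.size(), false);

    while (numDone < readers.size()) {
        std::string line;
        bool valid = !done[idx] && static_cast<bool>(std::getline(*readers[idx], line));
        if (valid) {
            std::cout << "reader " << idx << ": " << line << "\n";
            if (++numRead < block) continue;   // keep reading from this reader
        } else if (!done[idx]) {
            done[idx] = true;                  // this reader is exhausted
            ++numDone;
        }
        idx = (idx + 1) % readers.size();      // switch to the next reader
        numRead = 0;
    }
    return 0;
}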
Example #17
// The algorithm is as follows. We create M BWTs for subsets of
// the input reads. These are created independently and written
// to disk. They are then merged either sequentially or pairwise
// to create the final BWT (a sketch of just the pairwise round
// structure follows this function).
void buildBWTDisk(const std::string& in_filename, const std::string& out_prefix, 
                  const std::string& bwt_extension, const std::string& sai_extension,
                  bool doReverse, int numThreads, int numReadsPerBatch, int storageLevel)
{
    size_t MAX_READS_PER_GROUP = numReadsPerBatch;

    SeqReader* pReader = new SeqReader(in_filename);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    ReadTable* pCurrRT = new ReadTable;
    bool done = false;
    while(!done)
    {
        done = !pReader->get(record);

        if(!done)
        {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if(doReverse)
                item.seq.reverse();
            pCurrRT->addRead(item);
            ++numReadTotal;
        }

        if(pCurrRT->getCount() >= MAX_READS_PER_GROUP || (done && pCurrRT->getCount() > 0))
        {
            // Compute the SA and BWT for this group
            SuffixArray* pSA = new SuffixArray(pCurrRT, numThreads);

            // Write the BWT to disk                
            std::string bwt_temp_filename = makeTempName(out_prefix, groupID, bwt_extension);
            pSA->writeBWT(bwt_temp_filename, pCurrRT);

            std::string sai_temp_filename = makeTempName(out_prefix, groupID, sai_extension);
            pSA->writeIndex(sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = in_filename;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Cleanup
            delete pSA;

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            pCurrRT->clear();
        }
    }
    delete pCurrRT;
    delete pReader;

    // Phase 2: Pairwise merge the BWTs
    int round = 1;
    MergeVector nextMergeRound;
    while(mergeVector.size() > 1)
    {
        std::cout << "Starting round " << round << "\n";
        pReader = new SeqReader(in_filename);
        for(size_t i = 0; i < mergeVector.size(); i+=2)
        {
            if(i + 1 != mergeVector.size())
            {
                std::string bwt_merged_name = makeTempName(out_prefix, groupID, bwt_extension);
                std::string sai_merged_name = makeTempName(out_prefix, groupID, sai_extension);

                MergeItem item1 = mergeVector[i];
                MergeItem item2 = mergeVector[i+1];

                // Perform the actual merge
                int64_t curr_idx = merge(pReader, item1, item2, 
                                         bwt_merged_name, sai_merged_name, 
                                         doReverse, numThreads, storageLevel);

                // pReader now points to the end of item1's block of 
                // reads. Skip item2's reads
                assert(curr_idx == item2.start_index);
                while(curr_idx <= item2.end_index)
                {
                    bool eof = !pReader->get(record);
                    assert(!eof);
                    (void)eof;
                    ++curr_idx;
                }

                // Create the merged mergeItem to use in the next round
                MergeItem merged;
                merged.start_index = item1.start_index;
                merged.end_index = item2.end_index;
                merged.bwt_filename = bwt_merged_name;
                merged.sai_filename = sai_merged_name;
                nextMergeRound.push_back(merged);

                // Done with the temp files, remove them
                unlink(item1.bwt_filename.c_str());
                unlink(item2.bwt_filename.c_str());
                unlink(item1.sai_filename.c_str());
                unlink(item2.sai_filename.c_str());

                ++groupID;
            }
            else
            {
                // Singleton, pass through to the next round
                nextMergeRound.push_back(mergeVector[i]);
            }
        }
        delete pReader;
        mergeVector.clear();
        mergeVector.swap(nextMergeRound);
        ++round;
    }
    assert(mergeVector.size() == 1);

    // Done, rename the files to their final name
    std::stringstream bwt_ss;
    bwt_ss << out_prefix << bwt_extension << (USE_GZ ? ".gz" : "");
    std::string bwt_final_filename = bwt_ss.str();
    rename(mergeVector.front().bwt_filename.c_str(), bwt_final_filename.c_str());

    std::stringstream sai_ss;
    sai_ss << out_prefix << sai_extension << (USE_GZ ? ".gz" : "");
    std::string sai_final_filename = sai_ss.str();
    rename(mergeVector.front().sai_filename.c_str(), sai_final_filename.c_str());
}
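Phase 2 above repeatedly merges adjacent pairs of BWT blocks and passes a trailing singleton through to the next round. A minimal sketch of only that round structure, with short strings standing in for BWT blocks (the labels are hypothetical and no actual BWT merging happens):
#include <iostream>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> blocks = {"b0", "b1", "b2", "b3", "b4"};
    int round = 1;
    while (blocks.size() > 1) {
        std::vector<std::string> next;
        for (size_t i = 0; i < blocks.size(); i += 2) {
            if (i + 1 != blocks.size())
                next.push_back(blocks[i] + "+" + blocks[i + 1]); // pairwise merge
            else
                next.push_back(blocks[i]);                       // singleton passes through
        }
        std::cout << "after round " << round++ << ":";
        for (size_t i = 0; i < next.size(); ++i) std::cout << " " << next[i];
        std::cout << "\n";
        blocks.swap(next);
    }
    return 0;
}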
Example #18
 static void init_suffix_array( const String& s, SuffixArray& sa ) {
   sa.init(s);
   sa.build();
   sa.buildHeight();
 }
Example #19
int main(int argc, char* argv[]) {
  // handle parameters
  string query;
  string fileNameSuffix;
  string fileNameSource;
  int loadFlag = false;
  int saveFlag = false;
  int createFlag = false;
  int queryFlag = false;
  int stdioFlag = false;  // receive requests from STDIN, respond to STDOUT
  string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
  while(1) {
    static struct option long_options[] = {
      {"load", required_argument, 0, 'l'},
      {"save", required_argument, 0, 's'},
      {"create", required_argument, 0, 'c'},
      {"query", required_argument, 0, 'q'},
      {"stdio", no_argument, 0, 'i'},
      {0, 0, 0, 0}
    };
    int option_index = 0;
    int c = getopt_long (argc, argv, "l:s:c:q:i", long_options, &option_index);
    if (c == -1) break;
    switch (c) {
    case 'l':
      fileNameSuffix = string(optarg);
      loadFlag = true;
      break;
    case 's':
      fileNameSuffix = string(optarg);
      saveFlag = true;
      break;
    case 'c':
      fileNameSource = string(optarg);
      createFlag = true;
      break;
    case 'q':
      query = string(optarg);
      queryFlag = true;
      break;
    case 'i':
      stdioFlag = true;
      break;
    default:
      cerr << info;
      exit(1);
    }
  }
  if (stdioFlag) {
    queryFlag = true;
  }

  // check if parameter settings are legal
  if (saveFlag && !createFlag) {
    cerr << "error: cannot save without creating\n" << info;
    exit(1);
  }
  if (saveFlag && loadFlag) {
    cerr << "error: cannot load and save at the same time\n" << info;
    exit(1);
  }
  if (!loadFlag && !createFlag) {
    cerr << "error: neither load or create - i have no info!\n" << info;
    exit(1);
  }

  // do your thing
  if (createFlag) {
    cerr << "will create\n";
    cerr << "corpus is in " << fileNameSource << endl;
    suffixArray.Create( fileNameSource );
    if (saveFlag) {
      suffixArray.Save( fileNameSuffix );
      cerr << "will save in " << fileNameSuffix << endl;
    }
  }
  if (loadFlag) {
    cerr << "will load from " << fileNameSuffix << endl;
    suffixArray.Load( fileNameSuffix );
  }
  if (stdioFlag) {
    while(true) {
      string query;
      if (getline(cin, query, '\n').eof()) {
        return 0;
      }
      cout << lookup( query ) << endl;
    }
  } 
  else if (queryFlag) {
    cout << lookup( query ) << endl;
  }
  return 0;
}
Example #20
size_t lookup( string query ) {
  cerr << "query is " << query << endl;
  vector< string > queryString = tokenize( query.c_str() );
  return suffixArray.Count( queryString );
}
Example #21
int main(int argc, char* argv[]) 
{
	// handle parameters
	string query;
	string fileNameSuffix;
	string fileNameSource;
	string fileNameTarget = "";
	string fileNameAlignment = "";
	int loadFlag = false;
	int saveFlag = false;
	int createFlag = false;
	int queryFlag = false;
	int htmlFlag = false;
	string info = "usage: suffix-query\n\t[--load file]\n\t[--save file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n";
	while(1) {
		static struct option long_options[] = {
			{"load", required_argument, 0, 'l'},
			{"save", required_argument, 0, 's'},
			{"create", required_argument, 0, 'c'},
			{"query", required_argument, 0, 'q'},
			{"target", required_argument, 0, 't'},
			{"alignment", required_argument, 0, 'a'},
			{"html", no_argument, &htmlFlag, 0},
			{0, 0, 0, 0}
		};
		int option_index = 0;
		int c = getopt_long (argc, argv, "l:s:c:q:t:a:h", long_options, &option_index);
		if (c == -1) break;
		switch (c) {
			case 'l':
				fileNameSuffix = string(optarg);
				loadFlag = true;
				break;
			case 't':
				fileNameTarget = string(optarg);
				break;
			case 'a':
				fileNameAlignment = string(optarg);
				break;
			case 's':
				fileNameSuffix = string(optarg);
				saveFlag = true;
				break;
			case 'c':
				fileNameSource = string(optarg);
				createFlag = true;
				break;
			case 'q':
				query = string(optarg);
				queryFlag = true;
				break;
			default:
				cerr << info;
				exit(1);
		}
	}		
  
	// check if parameter settings are legal
	if (saveFlag && !createFlag) {
		cerr << "error: cannot save without creating\n" << info;
		exit(1);
	}
	if (saveFlag && loadFlag) {
		cerr << "error: cannot load and save at the same time\n" << info;
		exit(1);
	}
	if (!loadFlag && !createFlag) {
		cerr << "error: neither load or create - i have no info!\n" << info;
		exit(1);
	}
	if (createFlag && (fileNameTarget == "" || fileNameAlignment == "")) {
		cerr << "error: i have no target corpus or alignment\n" << info;
		exit(1);		
	}

	// do your thing
	SuffixArray suffixArray;
	TargetCorpus targetCorpus;
	Alignment alignment;
	if (createFlag) {
		cerr << "will create\n";
		cerr << "source corpus is in " << fileNameSource << endl;
		suffixArray.Create( fileNameSource );
		cerr << "target corpus is in " << fileNameTarget << endl;
		targetCorpus.Create( fileNameTarget );
		cerr << "alignment is in " << fileNameAlignment << endl;
		alignment.Create( fileNameAlignment );
		if (saveFlag) {
			suffixArray.Save( fileNameSuffix );
			targetCorpus.Save( fileNameSuffix );
			alignment.Save( fileNameSuffix );
			cerr << "will save in " << fileNameSuffix << endl;
		}
	}
	if (loadFlag) {
		cerr << "will load from " << fileNameSuffix << endl;
		suffixArray.Load( fileNameSuffix );
		targetCorpus.Load( fileNameSuffix );
		alignment.Load( fileNameSuffix );
	}
	if (queryFlag) {
		cerr << "query is " << query << endl;
		vector< string > queryString = alignment.Tokenize( query.c_str() );
		PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment );
		ppCollection.GetCollection( queryString );
		ppCollection.PrintHTML();
	}
}
Example #22
void
print_pairwise_lcp(std::string const &s, SuffixArray &sa) {
    vi_t pos, lcp;
    sa.lcp_pairwise(pos, lcp);
    print_suffix_array(s, sa, &lcp);
}
Example #23
int main(int argc, char* argv[])
{
    // handle parameters
    string query;
    string fileNameSuffix;
    string fileNameSource;
    string fileNameTarget = "";
    string fileNameAlignment = "";
    int loadFlag = false;
    int saveFlag = false;
    int createFlag = false;
    int queryFlag = false;
    int htmlFlag = false;   // output as HTML
    int prettyFlag = false; // output readable on screen
    int stdioFlag = false;  // receive requests from STDIN, respond to STDOUT
    int max_translation = 20;
    int max_example = 50;
    string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n\t[--translations count]\n\t[--examples count]\n\t[--html]\n\t[--stdio]\n";
    while(1) {
        static struct option long_options[] = {
            {"load", required_argument, 0, 'l'},
            {"save", required_argument, 0, 's'},
            {"create", required_argument, 0, 'c'},
            {"query", required_argument, 0, 'q'},
            {"target", required_argument, 0, 't'},
            {"alignment", required_argument, 0, 'a'},
            {"html", no_argument, 0, 'h'},
            {"pretty", no_argument, 0, 'p'},
            {"stdio", no_argument, 0, 'i'},
            {"translations", required_argument, 0, 'o'},
            {"examples", required_argument, 0, 'e'},
            {0, 0, 0, 0}
        };
        int option_index = 0;
        int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:hpio:e:", long_options, &option_index);
        if (c == -1) break;
        switch (c) {
        case 'l':
            fileNameSuffix = string(optarg);
            loadFlag = true;
            break;
        case 't':
            fileNameTarget = string(optarg);
            break;
        case 'a':
            fileNameAlignment = string(optarg);
            break;
        case 's':
            fileNameSuffix = string(optarg);
            saveFlag = true;
            break;
        case 'c':
            fileNameSource = string(optarg);
            createFlag = true;
            break;
        case 'Q':
            query = base64_decode(string(optarg));
            queryFlag = true;
            break;
        case 'q':
            query = string(optarg);
            queryFlag = true;
            break;
        case 'o':
            max_translation = atoi(optarg);
            break;
        case 'e':
            max_example = atoi(optarg);
            break;
        case 'p':
            prettyFlag = true;
            break;
        case 'h':
            htmlFlag = true;
            break;
        case 'i':
            stdioFlag = true;
            break;
        default:
            cerr << info;
            exit(1);
        }
    }
    if (stdioFlag) {
        queryFlag = true;
    }

    // check if parameter settings are legal
    if (saveFlag && !createFlag) {
        cerr << "error: cannot save without creating\n" << info;
        exit(1);
    }
    if (saveFlag && loadFlag) {
        cerr << "error: cannot load and save at the same time\n" << info;
        exit(1);
    }
    if (!loadFlag && !createFlag) {
        cerr << "error: neither load or create - i have no info!\n" << info;
        exit(1);
    }
    if (createFlag && (fileNameTarget == "" || fileNameAlignment == "")) {
        cerr << "error: i have no target corpus or alignment\n" << info;
        exit(1);
    }

    // do your thing
    SuffixArray suffixArray;
    TargetCorpus targetCorpus;
    Alignment alignment;
    if (createFlag) {
        cerr << "will create\n";
        cerr << "source corpus is in " << fileNameSource << endl;
        suffixArray.Create( fileNameSource );
        cerr << "target corpus is in " << fileNameTarget << endl;
        targetCorpus.Create( fileNameTarget );
        cerr << "alignment is in " << fileNameAlignment << endl;
        alignment.Create( fileNameAlignment );
        if (saveFlag) {
            suffixArray.Save( fileNameSuffix );
            targetCorpus.Save( fileNameSuffix );
            alignment.Save( fileNameSuffix );
            cerr << "will save in " << fileNameSuffix << endl;
        }
    }
    if (loadFlag) {
        cerr << "will load from " << fileNameSuffix << endl;
        suffixArray.Load( fileNameSuffix );
        targetCorpus.Load( fileNameSuffix );
        alignment.Load( fileNameSuffix );
    }
    if (stdioFlag) {
        cout << "-|||- BICONCOR START -|||-" << endl << flush;
        while(true) {
            string query;
            if (getline(cin, query, '\n').eof()) {
                return 0;
            }
            vector< string > queryString = alignment.Tokenize( query.c_str() );
            PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example );
            int total = ppCollection.GetCollection( queryString );
            cout << "TOTAL: " << total << endl;
            if (htmlFlag) {
                ppCollection.PrintHTML();
            }
            else {
                ppCollection.Print(prettyFlag);
            }
            cout << "-|||- BICONCOR END -|||-" << endl << flush;
        }
    }
    else if (queryFlag) {
        cerr << "query is " << query << endl;
        vector< string > queryString = alignment.Tokenize( query.c_str() );
        PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example );
        ppCollection.GetCollection( queryString );
        if (htmlFlag) {
            ppCollection.PrintHTML();
        }
        else {
            ppCollection.Print(prettyFlag);
        }
    }

    return 0;
}
Example #24
int main(int argc, char* argv[]) {

	if (argc < 2) {
		PrintUsage();
		exit(1);
	}
	int argi = 1;
	string saFile = argv[argi++];
	vector<string> inFiles;
	
	int doBLT = 1;
	int bltPrefixLength = 8;
	int parsingOptions = 0;
	SAType saBuildType = larsson;
	int read4BitCompressed  = 0;
	int diffCoverSize = 0;
	while (argi < argc) {
		if (strlen(argv[argi]) > 0 and
				argv[argi][0] == '-'){ 
			parsingOptions = 1;
		}
		if (!parsingOptions) {
			inFiles.push_back(argv[argi]);
		}
		else {
			if (strcmp(argv[argi], "-blt") == 0) {
				doBLT = 1;
        if (argi < argc - 1) {
          bltPrefixLength = atoi(argv[++argi]);
          if (bltPrefixLength == 0) {
            cout << argv[argi] << " is not a valid lookup table length." << endl;
            exit(1);
          }
        }
        else {
          cout << "Please specify a lookup table length." << endl;
          exit(1);
        }
			}
			else if (strcmp(argv[argi], "-mamy") == 0) {
				saBuildType = manmy;
			}
			else if (strcmp(argv[argi], "-larsson") == 0) {
				saBuildType = larsson;
			}
			else if (strcmp(argv[argi], "-mcilroy") == 0) {
				saBuildType = mcilroy;
			}
			else if (strcmp(argv[argi], "-slow") == 0) {
				saBuildType = slow;
			}
			else if (strcmp(argv[argi], "-kark") == 0) {
				saBuildType = kark;
			}
			else if (strcmp(argv[argi], "-mafe") == 0) {
				saBuildType = mafe;
			}
			else if (strcmp(argv[argi], "-welter") == 0) {
				saBuildType = welter;
			}
			else if (strcmp(argv[argi], "-welterweight") == 0) {
        if (argi < argc-1) {
          diffCoverSize = atoi(argv[++argi]);
        }
        else {
          cout << "Please specify a difference cover size.  Valid values are 7,32,64,111, and 2281.  Larger values use less memory but may be slower." << endl;
          exit(1);
        }
        if ( ! (diffCoverSize == 7 or 
                diffCoverSize == 32 or
                diffCoverSize == 64 or 
                diffCoverSize == 111 or
                diffCoverSize == 2281) ) {
          cout << "The difference cover size must be one of 7,32,64,111, or 2281." << endl;
          cout << "Larger numbers use less space but are more slow." << endl;
          exit(1);
        }
			}
			else if (strcmp(argv[argi], "-4bit") == 0) {
				read4BitCompressed = 1;
			}
			else {
				PrintUsage();
				cout << "ERROR, bad option: " << argv[argi] << endl;
				exit(1);
			}
		}
		++argi;
	}
  
  if (inFiles.size() == 0) {
    //
    // Special use case: the input file is a fasta file.  Write to that file + .sa
    //
    inFiles.push_back(saFile);
    saFile = saFile + ".sa";
  }
  
	VectorIndex inFileIndex;
	FASTASequence seq;
	CompressedSequence<FASTASequence> compSeq;

	if (read4BitCompressed == 0) {
		for (inFileIndex = 0; inFileIndex < inFiles.size(); ++inFileIndex) {
			FASTAReader reader;
			reader.Init(inFiles[inFileIndex]);
			reader.SetSpacePadding(111);
			if (saBuildType == kark) {
				//
				// The Karkkainen sa building method requires a little extra
				// space at the end of the dna sequence so that counting may
				// be done mod 3 without adding extra logic for boundaries.
				//
			}
  
			if (inFileIndex == 0) {
				reader.ReadAllSequencesIntoOne(seq);
				reader.Close();
			}
			else {
				while(reader.ConcatenateNext(seq)) {
					cout << "added " << seq.title << endl;
				}
			}
		}
		seq.ToThreeBit();
		//seq.ToUpper();
	}
	else {
		assert(inFiles.size() == 1);
		cout << "reading compressed sequence." << endl;
		compSeq.Read(inFiles[0]);
		seq.seq = compSeq.seq;
		seq.length = compSeq.length;
		compSeq.RemoveCompressionCounts();
		cout << "done." << endl;
	}

  //
  // For now, do not allow creation of suffix arrays on sequences > 4G.
  //
  if (seq.length >= UINT_MAX) {
    cout << "ERROR, references greater than " << UINT_MAX << " bases are not supported." << endl;
    cout << "Consider breaking the reference into multiple files, running alignment. " << endl;
    cout << "against each file, and merging the result." << endl;
    exit(1);
  }
	vector<int> alphabet;
	
	SuffixArray<Nucleotide, vector<int> >  sa;
	//	sa.InitTwoBitDNAAlphabet(alphabet);
	//	sa.InitAsciiCharDNAAlphabet(alphabet);
  sa.InitThreeBitDNAAlphabet(alphabet);

	if (saBuildType == manmy) {
		sa.MMBuildSuffixArray(seq.seq, seq.length, alphabet);
	}
	else if (saBuildType == mcilroy) {
		sa.index = new SAIndex[seq.length+1];
		DNALength i;
		for (i = 0; i < seq.length; i++) { sa.index[i] = seq.seq[i] + 1;}
		sa.index[seq.length] = 0;
		ssort(sa.index, NULL);
		for (i = 1; i < seq.length+1; i++ ){ sa.index[i-1] = sa.index[i];};
		sa.length = seq.length;
	}
	else if (saBuildType == larsson) {
		sa.LarssonBuildSuffixArray(seq.seq, seq.length, alphabet);
	}
	else if (saBuildType == kark) {
		sa.index = new SAIndex[seq.length];
		seq.ToThreeBit();
		DNALength p;
		for (p = 0; p < seq.length; p++ ){ seq.seq[p]++; }
		KarkkainenBuildSuffixArray<Nucleotide>(seq.seq, sa.index, seq.length, 5);
		sa.length = seq.length;
	}
	else if (saBuildType == mafe) {
		//		sa.MaFeBuildSuffixArray(seq.seq, seq.length);
		
	}
	else if (saBuildType == welter) {
		if (diffCoverSize == 0) {
			sa.LightweightBuildSuffixArray(seq.seq, seq.length);
		}
		else {
			sa.LightweightBuildSuffixArray(seq.seq, seq.length, diffCoverSize);
		}
	}
	if (doBLT) {
		sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength);
	}
	sa.Write(saFile);

	return 0;

}
Example #25
void cluster()
{
    BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
    BWT* pRBWT = new BWT(opt::prefix + RBWT_EXT);
    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,opt::errorRate, opt::seedLength, opt::seedStride, true);

    pOverlapper->setExactModeOverlap(opt::errorRate < 0.001f);
    pOverlapper->setExactModeIrreducible(opt::errorRate < 0.001f);

    BitVector markedReads(pBWT->getNumStrings());

    std::string preclustersFile = opt::outFile + ".preclusters";
    std::ostream* pPreWriter = createWriter(preclustersFile);
    ClusterPostProcess postProcessor(pPreWriter, opt::minSize, &markedReads);
    
    // Set the cluster parameters
    ClusterParameters parameters;
    parameters.pOverlapper = pOverlapper;
    parameters.minOverlap = opt::minOverlap;
    parameters.maxClusterSize = opt::maxSize;
    parameters.maxIterations = opt::maxIterations;
    parameters.pMarkedReads = &markedReads;

    // Read the limit kmer sequences, if provided
    std::set<std::string>* pLimitKmers = NULL;

    if(!opt::limitFile.empty())
    {
        // Read in the limit sequences
        pLimitKmers = new std::set<std::string>;
        readLimitKmers(pLimitKmers);
        parameters.pLimitKmers = pLimitKmers;
        parameters.limitK = opt::limitKmer;
    }
    else
    {
        parameters.pLimitKmers = NULL;
        parameters.limitK = 0;
    }

    // Make pre-clusters from the reads
    if(opt::numThreads <= 1)
    {
        printf("[%s] starting serial-mode read clustering\n", PROGRAM_IDENT);
        ClusterProcess processor(parameters);
        
        // If the extend file is empty, build new clusters
        if(opt::extendFile.empty())
        {
            PROCESS_CLUSTER_SERIAL(opt::readsFile, &processor, &postProcessor);
        }
        else
        {
            // Process a set of preexisting clusters
            ClusterReader clusterReader(opt::extendFile);
            PROCESS_EXTEND_SERIAL(clusterReader, &processor, &postProcessor);
        }
    }
    else
    {
        printf("[%s] starting parallel-mode read clustering computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
        
        std::vector<ClusterProcess*> processorVector;
        for(int i = 0; i < opt::numThreads; ++i)
        {
            ClusterProcess* pProcessor = new ClusterProcess(parameters);
            processorVector.push_back(pProcessor);
        }
        
        if(opt::extendFile.empty())
        {
            PROCESS_CLUSTER_PARALLEL(opt::readsFile, processorVector, &postProcessor);
        }
        else
        {
            ClusterReader clusterReader(opt::extendFile);
            PROCESS_EXTEND_PARALLEL(clusterReader, processorVector, &postProcessor);
        }
        
        for(size_t i = 0; i < processorVector.size(); ++i)
        {
            delete processorVector[i];
            processorVector[i] = NULL;
        }
    }
    delete pPreWriter;
    delete pBWT;
    delete pRBWT;
    delete pOverlapper;

    // Deallocate limit kmers
    if(pLimitKmers != NULL)
        delete pLimitKmers;

    // Open the preclusters file and convert them to read names
    SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT);
    ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings());

    size_t seedIdx = 0;
    std::istream* pPreReader = createReader(preclustersFile);
    std::ostream* pClusterWriter = createWriter(opt::outFile);
    std::string line;
    while(getline(*pPreReader,line))
    {
        std::stringstream parser(line);
        std::string clusterName;
        std::string readSequence;
        size_t clusterSize;
        int64_t lowIdx;
        int64_t highIdx;
        parser >> clusterName >> clusterSize >> readSequence >> lowIdx >> highIdx;

        if(lowIdx > highIdx)
        {
            // This is an extra read that is not present in the FM-index
            // Output a record with a fake read ID
            *pClusterWriter << clusterName << "\t" << clusterSize << "\tseed-" << seedIdx++ << "\t" << readSequence << "\n";
        }
        else
        {
            for(int64_t i = lowIdx; i <= highIdx; ++i)
            {
                const ReadInfo& targetInfo = pRIT->getReadInfo(pFwdSAI->get(i).getID());
                std::string readName = targetInfo.id;
                *pClusterWriter << clusterName << "\t" << clusterSize << "\t" << readName << "\t" << readSequence << "\n";
            }
        }
    }
    unlink(preclustersFile.c_str());

    delete pFwdSAI;
    delete pRIT;
    delete pPreReader;
    delete pClusterWriter;
}
Example #26
// Compute the initial BWTs for the input file split into blocks of records using the SAIS algorithm
MergeVector computeInitialSAIS(const BWTDiskParameters& parameters)
{
    SeqReader* pReader = new SeqReader(parameters.inFile);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    ReadTable* pCurrRT = new ReadTable;
    bool done = false;
    while(!done)
    {
        done = !pReader->get(record);

        if(!done)
        {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if(parameters.bBuildReverse)
                item.seq.reverse();
            pCurrRT->addRead(item);
            ++numReadTotal;
        }

        if(pCurrRT->getCount() >= parameters.numReadsPerBatch || (done && pCurrRT->getCount() > 0))
        {
            // Compute the SA and BWT for this group
            SuffixArray* pSA = new SuffixArray(pCurrRT, 1);

            // Write the BWT to disk                
            std::string bwt_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.bwtExtension);
            pSA->writeBWT(bwt_temp_filename, pCurrRT);

            std::string sai_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.saiExtension);
            pSA->writeIndex(sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = parameters.inFile;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Cleanup
            delete pSA;

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            pCurrRT->clear();
        }
    }
    delete pCurrRT;
    delete pReader;
    return mergeVector;
}