/*
 * Legacy Python snippet kept from the original source. It lists the character
 * substitutions used to build the "edited name" column (presumably what
 * create_edited_name() implements -- TODO confirm):
 *
 *   ednm = spls[1].replace('\"',"_").strip()
 *   ednm = ednm.replace("\'","_")
 *   ednm = ednm.replace("\\","_")
 *   ednm = ednm.replace("/","_")
 *   ednm = ednm.replace("(","_")
 *   ednm = ednm.replace(")","_")
 *   ednm = ednm.replace(".","_")
 *   ednm = ednm.replace("&","_")
 *   ednm = ednm.replace(",","_")
 *   ednm = ednm.replace(" ","_")
 */

/**
 * Load the NCBI taxonomy dump and the GenBank flat files of one division into
 * the SQLite database named by db_name.
 *
 * Steps, in order:
 *   1. optionally download and unpack taxdump.tar.gz;
 *   2. read nodes.dmp (rank and parent id per taxon id) and insert one row
 *      into the taxonomy table per usable names.dmp line;
 *   3. fill parent_ncbi_map (parent id -> child ids) from the
 *      'scientific name' rows and call rebuild_tree(1, 1, conn);
 *   4. optionally download, then gunzip, gb<group>*.seq.gz, parse every
 *      matching .seq file in the current directory with GenBankReader and
 *      delete it afterwards;
 *   5. apply merged.dmp (old id -> new id) to sequence.ncbi_id;
 *   6. delete the taxonomy dump files.
 *
 * If the database cannot be opened an error is printed to stderr and the
 * function returns without performing the remaining steps.
 *
 * @param div   GenBank division code. "met" expands to pri, rod, mam, vrt,
 *              inv and bct; "all" additionally includes pln; any other value
 *              is used as the single division code. Stored in `division`.
 * @param downl when true, the taxonomy dump and the GenBank files are fetched
 *              with wget first; when false they must already be present in
 *              the current directory.
 */
void SQLiteDBController::load_seqs(string div, bool downl) {
    cout << "loading taxonomy" << endl;
    if (downl == true) {
        const char * cmd = "wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz";
        cout << "downloading with wget" << endl;
        system(cmd);
        cmd = "tar -xzvf taxdump.tar.gz";
        cout << "untaring" << endl;
        system(cmd);
    }

    string del("|");
    string line;
    vector<string> tokens;

    //read the nodes.dmp
    map<string, string> rank;
    map<string, string> parent_id;
    ifstream infile("nodes.dmp", ios::in);
    if (!infile) {
        cerr << "warning: could not open nodes.dmp" << endl;
    }
    while (getline(infile, line)) {
        tokens.clear();
        Tokenize(line, tokens, del);
        // tokens[0] (id), tokens[1] (parent id) and tokens[2] (rank) are read,
        // so at least three fields are required.
        if (tokens.size() > 2) {
            for (size_t i = 0; i < tokens.size(); i++) {
                TrimSpaces(tokens[i]);
            }
            string ncbi_id = tokens[0];
            rank[ncbi_id] = tokens[2];
            parent_id[ncbi_id] = tokens[1];
        }
    }
    infile.close();

    //read the names.dmp
    ifstream infile2("names.dmp", ios::in);
    if (!infile2) {
        cerr << "warning: could not open names.dmp" << endl;
    }
    sqlite3 *conn = NULL;
    int rc = sqlite3_open(db_name.c_str(), &conn);
    if (rc != 0) {
        cerr << "error: could not open database " << db_name << endl;
        sqlite3_close(conn); // the handle must be released even when opening failed
        return;
    }
    sqlite3_exec(conn, "BEGIN TRANSACTION", NULL, NULL, NULL);
    count = 0;
    while (getline(infile2, line)) {
        if (count % 100000 == 0) {
            cout << count << endl;
        }
        tokens.clear();
        Tokenize(line, tokens, del);
        // tokens[3] (name class) is read below, so four fields are required.
        if (tokens.size() > 3) {
            for (size_t i = 0; i < tokens.size(); i++) {
                TrimSpaces(tokens[i]);
            }
            string gin = tokens[0];
            string nm = create_name(tokens[1]); //need to double quote the single quotes and backslash the quotes
            string nm_class = tokens[3];
            string ednm = create_edited_name(tokens[1]); //need to edit the names
            // NOTE(review): the statement is assembled by string concatenation,
            // so names with unexpected quoting make the insert fail (see the
            // commented-out diagnostic below). Bound parameters would avoid
            // that, but would also change what create_name() output is stored.
            string sql = "insert into taxonomy (ncbi_id,name,name_class,node_rank,parent_ncbi_id,edited_name) values (";
            sql += gin + ",\"";
            sql += nm + "\",'";
            sql += nm_class + "','";
            sql += rank[gin] + "',";
            sql += parent_id[gin] + ",'";
            sql += ednm + "');";
            rc = sqlite3_exec(conn, sql.c_str(), 0, 0, 0);
            //uncomment to get the names that don't commit, mostly bad quotes
            // if (rc != 0)
            //     cout << sql << endl;
        }
        count += 1;
    }
    sqlite3_exec(conn, "COMMIT TRANSACTION", NULL, NULL, NULL);
    infile2.close();
    sqlite3_close(conn);

    cout << "updating left/right values" << endl;
    Database cppconn(db_name);
    Query query(cppconn);
    string select_sql = "select ncbi_id,parent_ncbi_id from taxonomy where name_class = 'scientific name';";
    query.get_result(select_sql);
    while (query.fetch_row()) {
        int nc = query.getval();
        int pc = query.getval();
        // operator[] creates the child list on first use
        parent_ncbi_map[pc].push_back(nc);
    }

    //remove files
    remove("citations.dmp");
    remove("division.dmp");
    remove("gc.prt");
    remove("gencode.dmp");
    remove("readme.txt");

    //get the root and send to rebuild
    count = 0;
    rc = sqlite3_open(db_name.c_str(), &conn);
    if (rc != 0) {
        cerr << "error: could not open database " << db_name << endl;
        sqlite3_close(conn);
        return;
    }
    sqlite3_exec(conn, "BEGIN TRANSACTION", NULL, NULL, NULL);
    rebuild_tree(1, 1, conn);
    sqlite3_exec(conn, "COMMIT TRANSACTION", NULL, NULL, NULL);
    sqlite3_close(conn);

    cout << "loading seqs" << endl;
    division = div;
    vector<string> runall; // GenBank division codes to load
    if (division == "met" || division == "all") {
        runall.push_back("pri");
        runall.push_back("rod");
        runall.push_back("mam");
        runall.push_back("vrt");
        runall.push_back("inv");
        if (division == "all")
            runall.push_back("pln");
        runall.push_back("bct");
    } else {
        runall.push_back(division);
    }

    // Fetch (optionally) and uncompress the flat files of every division.
    for (size_t i = 0; i < runall.size(); i++) {
        string cmd;
        if (downl == true) {
            cmd = "wget ftp://ftp.ncbi.nih.gov/genbank/gb" + runall[i] + "*.seq.gz";
            cout << "downloading with wget" << endl;
            system(cmd.c_str());
        }
        cmd = "gunzip -d gb" + runall[i] + "*.seq.gz";
        cout << "uncompressing" << endl;
        system(cmd.c_str());
    }

    vector<string> file_names;
    cout << "getting file names" << endl;
    getdir(".", file_names);
    const string seq_ext(".seq");
    for (size_t i = 0; i < file_names.size(); i++) {
        string filen = file_names[i];
        // Length check first: names shorter than the extension (e.g. "." or
        // "..") must not reach an offset computed as size() - 4.
        if (filen.size() < seq_ext.size()
                || filen.compare(filen.size() - seq_ext.size(), seq_ext.size(), seq_ext) != 0) {
            continue;
        }
        for (size_t j = 0; j < runall.size(); j++) {
            if (filen.find("gb" + runall[j]) != string::npos) {
                cout << filen << endl;
                GenBankReader gbr;
                gbr.parse_file(filen, db_name);
                remove(filen.c_str());
                break; // the file is parsed and deleted exactly once
            }
        }
    }

    cout << "merging old names with new names" << endl;
    ifstream infile3("merged.dmp", ios::in);
    if (!infile3) {
        cerr << "warning: could not open merged.dmp" << endl;
    }
    rc = sqlite3_open(db_name.c_str(), &conn);
    if (rc != 0) {
        cerr << "error: could not open database " << db_name << endl;
        sqlite3_close(conn);
        return;
    }
    sqlite3_exec(conn, "BEGIN TRANSACTION", NULL, NULL, NULL);
    count = 0;
    while (getline(infile3, line)) {
        if (count % 100000 == 0) {
            cout << count << endl;
        }
        tokens.clear();
        Tokenize(line, tokens, del);
        for (size_t i = 0; i < tokens.size(); i++) {
            TrimSpaces(tokens[i]);
        }
        // tokens[0] is the old id, tokens[1] the id it was merged into
        if (tokens.size() > 1) {
            string sql = "update sequence set ncbi_id = ";
            sql += tokens[1];
            sql += " where ncbi_id = ";
            sql += tokens[0];
            sql += ";";
            rc = sqlite3_exec(conn, sql.c_str(), 0, 0, 0);
            if (rc != 0)
                cout << sql << endl;
        }
        count += 1;
    }
    infile3.close();
    sqlite3_exec(conn, "COMMIT TRANSACTION", NULL, NULL, NULL);
    sqlite3_close(conn);

    remove("merged.dmp");
    remove("names.dmp");
    remove("nodes.dmp");
    remove("delnodes.dmp");
    cout << "finished loading" << endl;
}
/*ednm = spls[1].replace('\"',"_").strip() ednm = ednm.replace("\'","_") ednm = ednm.replace("\\","_") ednm = ednm.replace("/","_") ednm = ednm.replace("(","_") ednm = ednm.replace(")","_") ednm = ednm.replace(".","_") ednm = ednm.replace("&","_") ednm = ednm.replace(",","_") ednm = ednm.replace(" ","_") */ void SQLiteDBController::load_seqs(string div, string ref, bool downl) { cout << "loading taxonomy" << endl; if (downl == true) { const char * cmd = "wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"; cout << "downloading with wget" << endl; system(cmd); cmd = "tar -xzvf taxdump.tar.gz"; cout << "untaring" << endl; system(cmd); } //read the nodes.dmp map<string, string> rank; map<string, string> parent_id; ifstream infile("nodes.dmp", ios::in); string line; vector<string> tokens; while (getline(infile, line)) { string del("|"); tokens.clear(); Tokenize(line, tokens, del); if (tokens.size() > 1) { for (int i = 0; i < tokens.size(); i++) { TrimSpaces(tokens[i]); } string ncbi_id = tokens[0]; rank[ncbi_id] = tokens[2]; parent_id[ncbi_id] = tokens[1]; } } infile.close(); //read the names.dmp ifstream infile2("names.dmp", ios::in); sqlite3 *conn; int rc = sqlite3_open(db_name.c_str(), &conn); char *zErrMsg = 0; sqlite3_exec(conn, "BEGIN TRANSACTION", NULL, NULL, NULL); count = 0; while (getline(infile2, line)) { if (count % 100000 == 0) { cout << count << endl; } string del("|"); tokens.clear(); Tokenize(line, tokens, del); if (tokens.size() > 1) { for (int i = 0; i < tokens.size(); i++) { TrimSpaces(tokens[i]); } string gin = tokens[0]; string nm = create_name(tokens[1]); //need to double quote the single quotes and backslash the quotes string nm_class = tokens[3]; string ednm = create_edited_name(tokens[1]); //need to edit the names string sql = "insert into taxonomy (ncbi_id,name,name_class,node_rank,parent_ncbi_id,edited_name) values ("; sql += gin + ",\""; sql += nm + "\",'"; sql += nm_class + "','"; sql += rank[gin] + "',"; sql += parent_id[gin] + ",'"; 
sql += ednm + "');"; //query.execute(sql); rc = sqlite3_exec(conn, sql.c_str(), 0, 0, 0); //uncomment to get the names that don't commit, mostly bad quotes // if (rc != 0) // cout << sql << endl; } count += 1; } sqlite3_exec(conn, "COMMIT TRANSACTION", NULL, NULL, NULL); infile2.close(); sqlite3_close(conn); cout << "updating left/right values" << endl; Database cppconn(db_name); Query query(cppconn); string cmd = "select ncbi_id,parent_ncbi_id from taxonomy where name_class = 'scientific name';"; query.get_result(cmd); while (query.fetch_row()) { int nc = query.getval(); int pc = query.getval(); if (parent_ncbi_map.count(pc) > 0) { parent_ncbi_map[pc].push_back(nc); } else { vector<int> tv; tv.push_back(nc); parent_ncbi_map[pc] = tv; } } //remove files remove("citations.dmp"); remove("division.dmp"); remove("gc.prt"); remove("gencode.dmp"); remove("readme.txt"); //get the root and send to rebuild count = 0; rc = sqlite3_open(db_name.c_str(), &conn); sqlite3_exec(conn, "BEGIN TRANSACTION", NULL, NULL, NULL); rebuild_tree(1, 1, conn); sqlite3_exec(conn, "COMMIT TRANSACTION", NULL, NULL, NULL); sqlite3_close(conn); cout << "loading seqs" << endl; vector<string> filesToProcess; if (div.length() > 0) { division = div; vector<string> groups; // taxonomic groups to be downloaded if (division == "met" || division == "all") { groups.push_back("pri"); groups.push_back("rod"); groups.push_back("mam"); groups.push_back("vrt"); groups.push_back("inv"); groups.push_back("bct"); if (division == "all") groups.push_back("pln"); } else { groups.push_back(division); } for (int i = 0; i < groups.size(); i++) { string cmd; string fnameString = "gb" + groups[i] + "*.seq.gz"; if (downl == true) { cmd = "wget ftp://ftp.ncbi.nih.gov/genbank/" + fnameString ; cout << "downloading with wget" << endl; system(cmd.c_str()); cmd = "gunzip -d gb" + groups[i] + "*.seq.gz"; cout << "uncompressing" << endl; system(cmd.c_str()); // } } else { // for (int i = 0; i < groups.size(); i++) { // cmd = 
"gunzip -d gb" + groups[i] + "*.seq.gz"; cmd = "gunzip -d " + fnameString; cout << "uncompressing" << endl; system(cmd.c_str()); } } // download daily updates if (downl == true) { string fnameString = "nc*.flat.gz"; string cmd = "wget -nv ftp://ftp.ncbi.nih.gov/genbank/daily-nc/" + fnameString; cout << "downloading dailies with wget" << endl; system(cmd.c_str()); cmd = "gunzip -d " + fnameString; cout << "uncompressing dailies" << endl; system(cmd.c_str()); } // get the names of the files to use vector<string> file_names; cout << "getting file names for gb flat files" << endl; getdir(".", file_names); for (int i = 0; i < file_names.size(); i++) { for (int j = 0; j < groups.size(); j++) { if (file_names[i].find("gb" + groups[j]) != string::npos && file_names[i].substr(file_names[i].size() - 4, 4) == ".seq") { filesToProcess.push_back(file_names[i]); } else if (file_names[i].find("nc") != string::npos && file_names[i].substr(file_names[i].size() - 5, 5) == ".flat") { filesToProcess.push_back(file_names[i]); } } } } if (ref.length() > 0) { // if we're getting whole genomes refseq = ref; vector<string> groups; // taxonomic groups to be downloaded if (refseq == "metazoan" || refseq == "all") { groups.push_back("mitochondrion"); groups.push_back("invertebrate"); groups.push_back("vertebrate-mammalian"); groups.push_back("vertebrate-other"); } else if (refseq == "plant" || refseq == "all") { groups.push_back("plant"); groups.push_back("plastid"); } else if (refseq == "microbes" || refseq == "all") { groups.push_back("microbial"); groups.push_back("plasmid"); groups.push_back("protozoa"); } if (refseq == "all") { groups.push_back("fungi"); groups.push_back("viral"); } else { groups.push_back(refseq); } for (int i = 0; i < groups.size(); i++) { string cmd; string fnameString = groups[i] + "*.genomic.gbff.gz"; if (downl == true) { cout << "downloading with wget" << endl; cmd = "wget ftp://ftp.ncbi.nih.gov/refseq/release/" + groups[i] + "/" + fnameString; // 
ftp://ftp.ncbi.nih.gov/refseq/release/mitochondrion/mitochondrion.1.1.genomic.fna.gz system(cmd.c_str()); cmd = "gunzip -d " + fnameString; cout << "uncompressing" << endl; system(cmd.c_str()); //} } else { // for (int i = 0; i < groups.size(); i++) { cmd = "gunzip -d " + fnameString; cout << "uncompressing" << endl; system(cmd.c_str()); } } // get the names of the files to use vector<string> file_names; cout << "getting file names for refseq flat files" << endl; getdir(".", file_names); for (int i = 0; i < file_names.size(); i++) { for (int j = 0; j < groups.size(); j++) { if (file_names[i].find(groups[j]) != string::npos && file_names[i].substr(file_names[i].size() - 5, 5) == ".gbff") { filesToProcess.push_back(file_names[i]); // cout << filen << endl; // GenBankReader gbr; // gbr.parse_file(filen, db_name); // remove(filen.c_str()); } } } } // now process the files into the db for (int i = 0; i < filesToProcess.size(); i++) { cout << filesToProcess[i] << endl; GenBankReader gbr; gbr.parse_file(filesToProcess[i], db_name); remove(filesToProcess[i].c_str()); } cout << "merging old names with new names" << endl; ifstream infile3("merged.dmp", ios::in); rc = sqlite3_open(db_name.c_str(), &conn); sqlite3_exec(conn, "BEGIN TRANSACTION", NULL, NULL, NULL); count = 0; while (getline(infile3, line)) { if (count % 100000 == 0) { cout << count << endl; } string del("|"); tokens.clear(); Tokenize(line, tokens, del); for (int i = 0; i < tokens.size(); i++) { TrimSpaces(tokens[i]); } if (tokens.size() > 1) { string sql = "update sequence set ncbi_id = "; sql += tokens[1]; sql += " where ncbi_id = "; sql += tokens[0]; sql += ";"; rc = sqlite3_exec(conn, sql.c_str(), 0, 0, 0); if (rc != 0) cout << sql << endl; } count += 1; } infile3.close(); sqlite3_exec(conn, "COMMIT TRANSACTION", NULL, NULL, NULL); sqlite3_close(conn); remove("merged.dmp"); remove("names.dmp"); remove("nodes.dmp"); remove("delnodes.dmp"); cout << "finished loading" << endl; }