示例#1
0
/*ednm = spls[1].replace('\"',"_").strip()
		ednm = ednm.replace("\'","_")
		ednm = ednm.replace("\\","_")
		ednm = ednm.replace("/","_")
		ednm = ednm.replace("(","_")
		ednm = ednm.replace(")","_")
		ednm = ednm.replace(".","_")
		ednm = ednm.replace("&","_")
		ednm = ednm.replace(",","_")
		ednm = ednm.replace(" ","_")
*/
/**
 * Rebuild the taxonomy table from the NCBI taxdump files and then load the
 * GenBank flat files of one division into the database named by db_name.
 *
 * Steps, in order:
 *   1. optionally fetch and unpack taxdump.tar.gz (wget + tar through system()),
 *   2. read nodes.dmp into per-id rank / parent lookups,
 *   3. insert one taxonomy row per usable names.dmp line (single transaction),
 *   4. fill parent_ncbi_map from the 'scientific name' rows and call
 *      rebuild_tree(1,1,conn),
 *   5. optionally fetch, then gunzip, gb<division>*.seq.gz and hand every
 *      matching .seq file in the working directory to GenBankReader,
 *      deleting each file once it has been parsed,
 *   6. apply merged.dmp to sequence.ncbi_id (single transaction),
 *   7. delete the remaining dump files.
 *
 * All files are read from / written to the current working directory.
 * If the database cannot be opened the function reports it and returns
 * without running the remaining steps.
 *
 * @param div   GenBank division code used to build the gb<div>*.seq.gz
 *              pattern; "met" and "all" expand to the fixed lists below.
 * @param downl when true the taxdump and the division files are downloaded
 *              with wget first; when false they must already be present.
 */
void SQLiteDBController::load_seqs(string div,bool downl) {
    cout << "loading taxonomy" << endl;
    if (downl == true) {
        const char * cmd = "wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz";
        cout << "downloading with wget" << endl;
        system(cmd);
        cmd = "tar -xzvf taxdump.tar.gz";
        cout << "untaring" << endl;
        system(cmd);
    }

    string del("|");
    string line;
    vector<string> tokens;

    //read the nodes.dmp
    map<string,string> rank;
    map<string,string> parent_id;
    ifstream infile("nodes.dmp",ios::in);
    if (!infile.is_open()) {
        cout << "warning: could not open nodes.dmp" << endl;
    }
    while(getline(infile,line)) {
        tokens.clear();
        Tokenize(line, tokens, del);
        // tokens[0..2] (id, parent id, rank) are read below, so three fields are required
        if(tokens.size() > 2) {
            for(size_t i=0; i<tokens.size(); i++) {
                TrimSpaces(tokens[i]);
            }
            string ncbi_id = tokens[0];
            rank[ncbi_id] = tokens[2];
            parent_id[ncbi_id] = tokens[1];
        }
    }
    infile.close();

    //read the names.dmp
    ifstream infile2 ("names.dmp",ios::in);
    if (!infile2.is_open()) {
        cout << "warning: could not open names.dmp" << endl;
    }
    sqlite3 *conn;
    int rc = sqlite3_open(db_name.c_str(), &conn);
    if (rc != 0) {
        cout << "could not open database " << db_name << endl;
        sqlite3_close(conn);
        return;
    }

    sqlite3_exec(conn, "BEGIN TRANSACTION", NULL, NULL, NULL);
    count = 0;
    while(getline(infile2,line)) {
        if (count % 100000 == 0) {
            cout << count << endl;
        }
        tokens.clear();
        Tokenize(line, tokens, del);
        // tokens[3] (name class) is read below, so four fields are required
        if(tokens.size() > 3) {
            for(size_t i=0; i<tokens.size(); i++) {
                TrimSpaces(tokens[i]);
            }
            string gin = tokens[0];
            string nm = create_name(tokens[1]);//need to double quote the single quotes and backslash the quotes
            string nm_class = tokens[3];
            string ednm = create_edited_name(tokens[1]);//need to edit the names
            string sql = "insert into taxonomy (ncbi_id,name,name_class,node_rank,parent_ncbi_id,edited_name) values (";
            sql += gin+",\"";
            sql += nm+"\",'";
            sql += nm_class+"','";
            sql += rank[gin]+"',";
            sql += parent_id[gin]+",'";
            sql += ednm+"');";
            rc = sqlite3_exec(conn, sql.c_str(), 0, 0, 0);
            //uncomment to get the names that don't commit, mostly bad quotes
//	    if (rc != 0)
//		cout << sql << endl;
        }
        count += 1;
    }
    sqlite3_exec(conn, "COMMIT TRANSACTION", NULL, NULL, NULL);
    infile2.close();
    sqlite3_close(conn);

    cout << "updating left/right values" << endl;
    Database cppconn(db_name);
    Query query(cppconn);
    string cmd = "select ncbi_id,parent_ncbi_id from taxonomy where name_class = 'scientific name';";
    query.get_result(cmd);
    while(query.fetch_row()) {
        // columns are consumed in select order: child id first, then parent id
        int nc = query.getval();
        int pc = query.getval();
        // operator[] creates the empty child list the first time a parent is seen
        parent_ncbi_map[pc].push_back(nc);
    }
    //remove files
    remove("citations.dmp");
    remove("division.dmp");
    remove("gc.prt");
    remove("gencode.dmp");
    remove("readme.txt");

    //get the root and send to rebuild
    count = 0;
    rc = sqlite3_open(db_name.c_str(), &conn);
    if (rc != 0) {
        cout << "could not open database " << db_name << endl;
        sqlite3_close(conn);
        return;
    }
    sqlite3_exec(conn, "BEGIN TRANSACTION", NULL, NULL, NULL);
    rebuild_tree(1,1,conn);
    sqlite3_exec(conn, "COMMIT TRANSACTION", NULL, NULL, NULL);
    sqlite3_close(conn);

    cout << "loading seqs" << endl;
    division = div;
    vector<string> runall;
    if(division == "met" || division == "all") {
        runall.push_back("pri");
        runall.push_back("rod");
        runall.push_back("mam");
        runall.push_back("vrt");
        runall.push_back("inv");
        if (division == "all")
            runall.push_back("pln");
        runall.push_back("bct");
    } else {
        runall.push_back(division);
    }

    // the archives are always unpacked; they are only fetched first when downl is set
    for(size_t i = 0; i<runall.size(); i++) {
        string shell_cmd;
        if (downl == true) {
            shell_cmd = "wget ftp://ftp.ncbi.nih.gov/genbank/gb"+runall[i]+"*.seq.gz";
            cout << "downloading with wget" << endl;
            system(shell_cmd.c_str());
        }
        shell_cmd = "gunzip -d gb"+runall[i]+"*.seq.gz";
        cout << "uncompressing" << endl;
        system(shell_cmd.c_str());
    }

    vector<string> file_names;
    cout << "getting file names" << endl;
    getdir(".",file_names);
    for(size_t i=0; i<file_names.size(); i++) {
        string filen = file_names[i];
        // test the length first: substr() throws when the name is shorter than the suffix
        if (filen.size() < 4 || filen.substr(filen.size()-4,4) != ".seq") {
            continue;
        }
        for(size_t j=0; j<runall.size(); j++) {
            if(filen.find("gb"+runall[j]) != string::npos) {
                cout << filen << endl;
                GenBankReader gbr;
                gbr.parse_file(filen,db_name);
                remove(filen.c_str());
                break; // the file has been deleted; never hand it to the parser again
            }
        }
    }

    cout << "merging old names with new names" << endl;
    ifstream infile3 ("merged.dmp",ios::in);
    if (!infile3.is_open()) {
        cout << "warning: could not open merged.dmp" << endl;
    }
    rc = sqlite3_open(db_name.c_str(), &conn);
    if (rc != 0) {
        cout << "could not open database " << db_name << endl;
        sqlite3_close(conn);
        return;
    }
    sqlite3_exec(conn, "BEGIN TRANSACTION", NULL, NULL, NULL);
    count = 0;
    while(getline(infile3,line)) {
        if (count % 100000 == 0) {
            cout << count << endl;
        }
        tokens.clear();
        Tokenize(line, tokens, del);
        for(size_t i = 0; i<tokens.size(); i++) {
            TrimSpaces(tokens[i]);
        }
        // tokens[0] is the id being replaced, tokens[1] the id it is replaced with
        if(tokens.size() > 1) {
            string sql = "update sequence set ncbi_id = ";
            sql += tokens[1];
            sql += " where ncbi_id = ";
            sql += tokens[0];
            sql += ";";
            rc = sqlite3_exec(conn, sql.c_str(), 0, 0, 0);
            if (rc != 0)
                cout << sql << endl;
        }
        count += 1;
    }
    infile3.close();
    sqlite3_exec(conn, "COMMIT TRANSACTION", NULL, NULL, NULL);
    sqlite3_close(conn);

    remove("merged.dmp");
    remove("names.dmp");
    remove("nodes.dmp");
    remove("delnodes.dmp");
    cout << "finished loading" << endl;
}
示例#2
0
/*ednm = spls[1].replace('\"',"_").strip()
 ednm = ednm.replace("\'","_")
 ednm = ednm.replace("\\","_")
 ednm = ednm.replace("/","_")
 ednm = ednm.replace("(","_")
 ednm = ednm.replace(")","_")
 ednm = ednm.replace(".","_")
 ednm = ednm.replace("&","_")
 ednm = ednm.replace(",","_")
 ednm = ednm.replace(" ","_")
 */
/**
 * Rebuild the taxonomy table from the NCBI taxdump files and then load
 * GenBank division files and/or RefSeq genome files into the database named
 * by db_name.
 *
 * Steps, in order:
 *   1. optionally fetch and unpack taxdump.tar.gz (wget + tar through system()),
 *   2. read nodes.dmp into per-id rank / parent lookups,
 *   3. insert one taxonomy row per usable names.dmp line (single transaction),
 *   4. fill parent_ncbi_map from the 'scientific name' rows and call
 *      rebuild_tree(1, 1, conn),
 *   5. when div is non-empty: optionally fetch, then gunzip, the
 *      gb<group>*.seq.gz files (plus, when downloading, the daily
 *      nc*.flat.gz files) and queue the matching .seq / .flat files,
 *   6. when ref is non-empty: optionally fetch, then gunzip, the
 *      <group>*.genomic.gbff.gz files and queue the matching .gbff files,
 *   7. hand every queued file to GenBankReader and delete it afterwards,
 *   8. apply merged.dmp to sequence.ncbi_id (single transaction),
 *   9. delete the remaining dump files.
 *
 * All files are read from / written to the current working directory.
 * If the database cannot be opened the function reports it and returns
 * without running the remaining steps.
 *
 * @param div   GenBank division code; "met" and "all" expand to the fixed
 *              lists below. Empty string skips the GenBank step.
 * @param ref   RefSeq selection; "metazoan", "plant", "microbes" and "all"
 *              expand to the fixed lists below, any other value is used as
 *              the group name itself. Empty string skips the RefSeq step.
 * @param downl when true the files are downloaded with wget first; when
 *              false they must already be present.
 */
void SQLiteDBController::load_seqs(string div, string ref, bool downl) {
	cout << "loading taxonomy" << endl;
	if (downl == true) {
		const char * cmd = "wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz";
		cout << "downloading with wget" << endl;
		system(cmd);
		cmd = "tar -xzvf taxdump.tar.gz";
		cout << "untaring" << endl;
		system(cmd);
	}

	string del("|");
	string line;
	vector<string> tokens;

	//read the nodes.dmp
	map<string, string> rank;
	map<string, string> parent_id;
	ifstream infile("nodes.dmp", ios::in);
	if (!infile.is_open()) {
		cout << "warning: could not open nodes.dmp" << endl;
	}
	while (getline(infile, line)) {
		tokens.clear();
		Tokenize(line, tokens, del);
		// tokens[0..2] (id, parent id, rank) are read below, so three fields are required
		if (tokens.size() > 2) {
			for (size_t i = 0; i < tokens.size(); i++) {
				TrimSpaces(tokens[i]);
			}
			string ncbi_id = tokens[0];
			rank[ncbi_id] = tokens[2];
			parent_id[ncbi_id] = tokens[1];
		}
	}
	infile.close();

	//read the names.dmp
	ifstream infile2("names.dmp", ios::in);
	if (!infile2.is_open()) {
		cout << "warning: could not open names.dmp" << endl;
	}
	sqlite3 *conn;
	int rc = sqlite3_open(db_name.c_str(), &conn);
	if (rc != 0) {
		cout << "could not open database " << db_name << endl;
		sqlite3_close(conn);
		return;
	}

	sqlite3_exec(conn, "BEGIN TRANSACTION", NULL, NULL, NULL);
	count = 0;
	while (getline(infile2, line)) {
		if (count % 100000 == 0) {
			cout << count << endl;
		}
		tokens.clear();
		Tokenize(line, tokens, del);
		// tokens[3] (name class) is read below, so four fields are required
		if (tokens.size() > 3) {
			for (size_t i = 0; i < tokens.size(); i++) {
				TrimSpaces(tokens[i]);
			}
			string gin = tokens[0];
			string nm = create_name(tokens[1]); //need to double quote the single quotes and backslash the quotes
			string nm_class = tokens[3];
			string ednm = create_edited_name(tokens[1]); //need to edit the names
			string sql = "insert into taxonomy (ncbi_id,name,name_class,node_rank,parent_ncbi_id,edited_name) values (";
			sql += gin + ",\"";
			sql += nm + "\",'";
			sql += nm_class + "','";
			sql += rank[gin] + "',";
			sql += parent_id[gin] + ",'";
			sql += ednm + "');";
			rc = sqlite3_exec(conn, sql.c_str(), 0, 0, 0);
			//uncomment to get the names that don't commit, mostly bad quotes
//	    if (rc != 0)
//		cout << sql << endl;
		}
		count += 1;
	}
	sqlite3_exec(conn, "COMMIT TRANSACTION", NULL, NULL, NULL);
	infile2.close();
	sqlite3_close(conn);

	cout << "updating left/right values" << endl;
	Database cppconn(db_name);
	Query query(cppconn);
	string cmd = "select ncbi_id,parent_ncbi_id from taxonomy where name_class = 'scientific name';";
	query.get_result(cmd);
	while (query.fetch_row()) {
		// columns are consumed in select order: child id first, then parent id
		int nc = query.getval();
		int pc = query.getval();
		// operator[] creates the empty child list the first time a parent is seen
		parent_ncbi_map[pc].push_back(nc);
	}
	//remove files
	remove("citations.dmp");
	remove("division.dmp");
	remove("gc.prt");
	remove("gencode.dmp");
	remove("readme.txt");

	//get the root and send to rebuild
	count = 0;
	rc = sqlite3_open(db_name.c_str(), &conn);
	if (rc != 0) {
		cout << "could not open database " << db_name << endl;
		sqlite3_close(conn);
		return;
	}
	sqlite3_exec(conn, "BEGIN TRANSACTION", NULL, NULL, NULL);
	rebuild_tree(1, 1, conn);
	sqlite3_exec(conn, "COMMIT TRANSACTION", NULL, NULL, NULL);
	sqlite3_close(conn);

	cout << "loading seqs" << endl;
	vector<string> filesToProcess;

	if (div.length() > 0) {
		division = div;
		vector<string> groups; // taxonomic groups to be downloaded
		if (division == "met" || division == "all") {
			groups.push_back("pri");
			groups.push_back("rod");
			groups.push_back("mam");
			groups.push_back("vrt");
			groups.push_back("inv");
			groups.push_back("bct");

			if (division == "all")
				groups.push_back("pln");

		} else {
			groups.push_back(division);
		}

		// the archives are always unpacked; they are only fetched first when downl is set
		for (size_t i = 0; i < groups.size(); i++) {
			string shell_cmd;
			string fnameString = "gb" + groups[i] + "*.seq.gz";
			if (downl == true) {
				shell_cmd = "wget ftp://ftp.ncbi.nih.gov/genbank/" + fnameString ;
				cout << "downloading with wget" << endl;
				system(shell_cmd.c_str());
			}
			shell_cmd = "gunzip -d " + fnameString;
			cout << "uncompressing" << endl;
			system(shell_cmd.c_str());
		}

		// download daily updates
		if (downl == true) {
			string fnameString = "nc*.flat.gz";
			string shell_cmd = "wget -nv ftp://ftp.ncbi.nih.gov/genbank/daily-nc/" + fnameString;
			cout << "downloading dailies with wget" << endl;
			system(shell_cmd.c_str());
			shell_cmd = "gunzip -d " + fnameString;
			cout << "uncompressing dailies" << endl;
			system(shell_cmd.c_str());
		}

		// get the names of the files to use
		vector<string> file_names;
		cout << "getting file names for gb flat files" << endl;
		getdir(".", file_names);
		for (size_t i = 0; i < file_names.size(); i++) {
			const string & fname = file_names[i];
			bool wanted = false;
			// lengths are tested before substr(), which throws on names shorter than the suffix
			if (fname.size() >= 4 && fname.substr(fname.size() - 4, 4) == ".seq") {
				for (size_t j = 0; j < groups.size() && !wanted; j++) {
					if (fname.find("gb" + groups[j]) != string::npos) {
						wanted = true;
					}
				}
			} else if (fname.size() >= 5 && fname.substr(fname.size() - 5, 5) == ".flat"
					&& fname.find("nc") != string::npos) {
				// daily files do not depend on the group, so they are tested once per
				// file rather than once per group
				wanted = true;
			}
			// each file is queued at most once: it is deleted right after parsing
			if (wanted) {
				filesToProcess.push_back(fname);
			}
		}
	}

	if (ref.length() > 0) { // if we're getting whole genomes

		refseq = ref;
		vector<string> groups; // taxonomic groups to be downloaded
		const bool wantAll = (refseq == "all");
		bool isAlias = wantAll; // true when refseq names a set of groups rather than one group

		// independent tests: "all" has to pick up every one of the three sets
		if (wantAll || refseq == "metazoan") {
			groups.push_back("mitochondrion");
			groups.push_back("invertebrate");
			groups.push_back("vertebrate-mammalian");
			groups.push_back("vertebrate-other");
			isAlias = true;
		}
		if (wantAll || refseq == "plant") {
			groups.push_back("plant");
			groups.push_back("plastid");
			isAlias = true;
		}
		if (wantAll || refseq == "microbes") {
			groups.push_back("microbial");
			groups.push_back("plasmid");
			groups.push_back("protozoa");
			isAlias = true;
		}

		if (wantAll) {
			groups.push_back("fungi");
			groups.push_back("viral");
		} else if (!isAlias) {
			// anything that is not one of the aliases above is taken as the group name itself
			groups.push_back(refseq);
		}

		// the archives are always unpacked; they are only fetched first when downl is set
		for (size_t i = 0; i < groups.size(); i++) {
			string shell_cmd;
			string fnameString = groups[i] + "*.genomic.gbff.gz";
			if (downl == true) {
				cout << "downloading with wget" << endl;
				shell_cmd = "wget ftp://ftp.ncbi.nih.gov/refseq/release/" + groups[i] + "/" + fnameString; // ftp://ftp.ncbi.nih.gov/refseq/release/mitochondrion/mitochondrion.1.1.genomic.fna.gz
				system(shell_cmd.c_str());
			}
			shell_cmd = "gunzip -d " + fnameString;
			cout << "uncompressing" << endl;
			system(shell_cmd.c_str());
		}

		// get the names of the files to use
		vector<string> file_names;
		cout << "getting file names for refseq flat files" << endl;
		getdir(".", file_names);
		for (size_t i = 0; i < file_names.size(); i++) {
			const string & fname = file_names[i];
			// test the length first: substr() throws when the name is shorter than the suffix
			if (fname.size() < 5 || fname.substr(fname.size() - 5, 5) != ".gbff") {
				continue;
			}
			for (size_t j = 0; j < groups.size(); j++) {
				if (fname.find(groups[j]) != string::npos) {
					filesToProcess.push_back(fname);
					break; // queue each file once even if several group names occur in it
				}
			}
		}
	}

	// now process the files into the db
	for (size_t i = 0; i < filesToProcess.size(); i++) {
		cout << filesToProcess[i] << endl;
		GenBankReader gbr;
		gbr.parse_file(filesToProcess[i], db_name);
		remove(filesToProcess[i].c_str());
	}

	cout << "merging old names with new names" << endl;
	ifstream infile3("merged.dmp", ios::in);
	if (!infile3.is_open()) {
		cout << "warning: could not open merged.dmp" << endl;
	}
	rc = sqlite3_open(db_name.c_str(), &conn);
	if (rc != 0) {
		cout << "could not open database " << db_name << endl;
		sqlite3_close(conn);
		return;
	}
	sqlite3_exec(conn, "BEGIN TRANSACTION", NULL, NULL, NULL);
	count = 0;
	while (getline(infile3, line)) {
		if (count % 100000 == 0) {
			cout << count << endl;
		}
		tokens.clear();
		Tokenize(line, tokens, del);
		for (size_t i = 0; i < tokens.size(); i++) {
			TrimSpaces(tokens[i]);
		}
		// tokens[0] is the id being replaced, tokens[1] the id it is replaced with
		if (tokens.size() > 1) {
			string sql = "update sequence set ncbi_id = ";
			sql += tokens[1];
			sql += " where ncbi_id = ";
			sql += tokens[0];
			sql += ";";
			rc = sqlite3_exec(conn, sql.c_str(), 0, 0, 0);
			if (rc != 0)
				cout << sql << endl;
		}
		count += 1;
	}
	infile3.close();
	sqlite3_exec(conn, "COMMIT TRANSACTION", NULL, NULL, NULL);
	sqlite3_close(conn);

	remove("merged.dmp");
	remove("names.dmp");
	remove("nodes.dmp");
	remove("delnodes.dmp");
	cout << "finished loading" << endl;
}