void NGSD::setReportVariants(const QString& filename, const VariantList& variants, QSet<int> selected_indices) { QString ps_id = processedSampleId(filename); //get variant ID for(int i=0; i<variants.count(); ++i) { getQuery().exec("UPDATE detected_variant SET report=" + QString(selected_indices.contains(i) ? "1" : "0" ) + " WHERE processed_sample_id='" + ps_id + "' AND variant_id='" + variantId(variants[i]) + "'"); } }
/// Appends a new annotation column to a variant list.
///
/// Adds a header entry named @p name, appends an empty annotation value to
/// every variant, and registers @p description as the column description.
///
/// @param variants    Variant list that is extended in place.
/// @param name        Name of the new annotation column.
/// @param description Description stored for the new column.
/// @return Index of the last annotation header after the insertion, i.e. the index of the new column.
int NGSD::addColumn(VariantList& variants, QString name, QString description)
{
	//header entry
	variants.annotations().append(VariantAnnotationHeader(name));

	//one empty value per variant, so that all variants have a slot for the new column
	const int variant_count = variants.count();
	for (int row=0; row<variant_count; ++row)
	{
		variants[row].annotations().append("");
	}

	//column description
	variants.annotationDescriptions().append(VariantAnnotationDescription(name, description));

	const int new_column_index = variants.annotations().count() - 1;
	return new_column_index;
}
/// Annotates a variant list with somatic variant statistics from the NGSD.
///
/// All existing columns whose name starts with "som_ihdb" are removed, then the
/// columns "som_ihdb_c" (number of counted samples) and "som_ihdb_p"
/// (comma-terminated list of project names) are added and filled per variant.
///
/// Per variant, each processed tumor sample is considered only once, the sample
/// determined from @p filename is excluded, and each sample is counted only once
/// even if it has several processed samples.
///
/// @param variants Variant list that is annotated in place.
/// @param filename File name; the part before the first '-' is passed to sampleId().
void NGSD::annotateSomatic(VariantList& variants, QString filename)
{
	//determine the sample from the first '-'-separated part of the file name
	const QStringList name_parts = filename.split('-');
	const QString ts_id = sampleId(name_parts[0], false);

	//warn if the sample could not be determined (annotation continues anyway)
	if (ts_id.isEmpty())
	{
		Log::warn("Could not find processed sample in NGSD from name '" + QFileInfo(filename).baseName() + "'. Annotation will be incomplete because processing system could not be determined!");
	}

	//remove all somatic NGSD columns (iterate over a copy because columns are removed from the list)
	const QList<VariantAnnotationHeader> existing_headers = variants.annotations();
	for (int h=0; h<existing_headers.count(); ++h)
	{
		const VariantAnnotationHeader& header = existing_headers[h];
		if (!header.name().startsWith("som_ihdb")) continue;

		removeColumnIfPresent(variants, header.name(), true);
	}

	//add the output columns
	const int count_column = addColumn(variants, "som_ihdb_c", "Somatic variant count within NGSD.");
	const int projects_column = addColumn(variants, "som_ihdb_p", "Projects with somatic variant in NGSD.");

	//(re-)annotate the variants
	for (int row=0; row<variants.count(); ++row)
	{
		Variant& variant = variants[row];

		SqlQuery query = getQuery();
		query.exec("SELECT s.id, dsv.processed_sample_id_tumor, p.name FROM detected_somatic_variant as dsv, variant as v, processed_sample ps, sample as s, project as p WHERE ps.project_id=p.id AND dsv.processed_sample_id_tumor=ps.id and dsv.variant_id=v.id AND ps.sample_id=s.id AND s.tumor='1' AND v.chr='"+variant.chr().str()+"' AND v.start='"+QString::number(variant.start())+"' AND v.end='"+QString::number(variant.end())+"' AND v.ref='"+variant.ref()+"' AND v.obs='"+variant.obs()+"'");

		//collect the number of counted samples per project
		QMap<QByteArray, int> samples_per_project;
		QSet<QByteArray> seen_processed_samples;
		QSet<QByteArray> seen_samples;
		while (query.next())
		{
			const QByteArray sample = query.value(0).toByteArray();
			const QByteArray processed_sample = query.value(1).toByteArray();
			const QByteArray project = query.value(2).toByteArray();

			//each processed sample is handled only once (there could be several variants because of indel window, but we want to process only one)
			if (!seen_processed_samples.contains(processed_sample))
			{
				seen_processed_samples.insert(processed_sample);

				//the current sample is excluded, and each sample is counted only once (there could be several processings of the same sample because of different processing systems or because of experiment repeats due to quality issues)
				if (!(sample==ts_id) && !seen_samples.contains(sample))
				{
					seen_samples.insert(sample);

					//operator[] inserts a zero-initialized counter for a project that is not in the map yet
					++samples_per_project[project];
				}
			}
		}

		//sum up the counts and build the project list (each project name is followed by a comma)
		int total_count = 0;
		QByteArray project_list;
		for (QMap<QByteArray, int>::const_iterator it = samples_per_project.constBegin(); it != samples_per_project.constEnd(); ++it)
		{
			total_count += it.value();
			project_list += it.key() + ",";
		}

		variant.annotations()[count_column] = QByteArray::number(total_count);
		variant.annotations()[projects_column] = project_list;
	}
}
void NGSD::annotate(VariantList& variants, QString filename) { initProgress("NGSD annotation", true); //get sample ids QString s_id = sampleId(filename, false); QString ps_id = processedSampleId(filename, false); QString sys_id = getValue("SELECT processing_system_id FROM processed_sample WHERE id='" + processedSampleId(filename, false) + "'").toString(); //check if we could determine the sample bool found_in_db = true; if (s_id=="" || ps_id=="" || sys_id=="") { Log::warn("Could not find processed sample in NGSD by name '" + filename + "'. Annotation will be incomplete because processing system could not be determined!"); found_in_db = false; } //get sample ids that have processed samples with the same processing system (not same sample, variants imported, same processing system, good quality of sample, not tumor) QSet<int> sys_sample_ids; SqlQuery tmp = getQuery(); tmp.exec("SELECT DISTINCT s.id FROM processed_sample as ps, sample s WHERE ps.processing_system_id='" + sys_id + "' AND ps.sample_id=s.id AND s.tumor='0' AND s.quality='good' AND s.id!='" + s_id + "' AND (SELECT count(id) FROM detected_variant as dv WHERE dv.processed_sample_id = ps.id)>0"); while(tmp.next()) { sys_sample_ids.insert(tmp.value(0).toInt()); } //remove all NGSD-specific columns QList<VariantAnnotationHeader> headers = variants.annotations(); foreach(const VariantAnnotationHeader& header, headers) { if (header.name().startsWith("ihdb_")) { removeColumnIfPresent(variants, header.name(), true); } } removeColumnIfPresent(variants, "classification", true); removeColumnIfPresent(variants, "classification_comment", true); removeColumnIfPresent(variants, "validated", true); removeColumnIfPresent(variants, "comment", true); //get required column indices QString num_samples = QString::number(sys_sample_ids.count()); int ihdb_hom_idx = addColumn(variants, "ihdb_hom", "Homozygous variant counts in NGSD for the same processing system (" + num_samples + " samples)."); int ihdb_het_idx = 
addColumn(variants, "ihdb_het", "Heterozyous variant counts in NGSD for the same processing system (" + num_samples + " samples)."); int ihdb_wt_idx = addColumn(variants, "ihdb_wt", "Wildtype variant counts in NGSD for the same processing system (" + num_samples + " samples)."); int ihdb_all_hom_idx = addColumn(variants, "ihdb_allsys_hom", "Homozygous variant counts in NGSD independent of the processing system."); int ihdb_all_het_idx = addColumn(variants, "ihdb_allsys_het", "Heterozygous variant counts in NGSD independent of the processing system."); int class_idx = addColumn(variants, "classification", "Classification from the NGSD."); int clacom_idx = addColumn(variants, "classification_comment", "Classification comment from the NGSD."); int valid_idx = addColumn(variants, "validated", "Validation information from the NGSD. Validation results of other samples are listed in brackets!"); if (variants.annotationIndexByName("comment", true, false)==-1) addColumn(variants, "comment", "Comments from the NGSD. 
Comments of other samples are listed in brackets!"); int comment_idx = variants.annotationIndexByName("comment", true, false); //(re-)annotate the variants SqlQuery query = getQuery(); for (int i=0; i<variants.count(); ++i) { //QTime timer; //timer.start(); //variant id Variant& v = variants[i]; QByteArray v_id = variantId(v, false).toLatin1(); //variant classification QVariant classification = getValue("SELECT class FROM variant_classification WHERE variant_id='" + v_id + "'", true); if (!classification.isNull()) { v.annotations()[class_idx] = classification.toByteArray().replace("n/a", ""); v.annotations()[clacom_idx] = getValue("SELECT comment FROM variant_classification WHERE variant_id='" + v_id + "'", true).toByteArray().replace("\n", " ").replace("\t", " "); } //int t_v = timer.elapsed(); //timer.restart(); //detected variant infos int dv_id = -1; QByteArray comment = ""; if (found_in_db) { query.exec("SELECT id, comment FROM detected_variant WHERE processed_sample_id='" + ps_id + "' AND variant_id='" + v_id + "'"); if (query.size()==1) { query.next(); dv_id = query.value(0).toInt(); comment = query.value(1).toByteArray(); } } //validation info int vv_id = -1; QByteArray val_status = ""; if (found_in_db) { query.exec("SELECT id, status FROM variant_validation WHERE sample_id='" + s_id + "' AND variant_id='" + v_id + "'"); if (query.size()==1) { query.next(); vv_id = query.value(0).toInt(); val_status = query.value(1).toByteArray().replace("n/a", ""); } } //int t_dv = timer.elapsed(); //timer.restart(); //validation info other samples int tps = 0; int fps = 0; query.exec("SELECT id, status FROM variant_validation WHERE variant_id='"+v_id+"' AND status!='n/a'"); while(query.next()) { if (query.value(0).toInt()==vv_id) continue; if (query.value(1).toByteArray()=="true positive") ++tps; else if (query.value(1).toByteArray()=="false positive") ++fps; } if (tps>0 || fps>0) { if (val_status=="") val_status = "n/a"; val_status += " (" + QByteArray::number(tps) + 
"xTP, " + QByteArray::number(fps) + "xFP)"; } //int t_val = timer.elapsed(); //timer.restart(); //comments other samples QList<QByteArray> comments; query.exec("SELECT id, comment FROM detected_variant WHERE variant_id='"+v_id+"' AND comment IS NOT NULL"); while(query.next()) { if (query.value(0).toInt()==dv_id) continue; QByteArray tmp = query.value(1).toByteArray().trimmed(); if (tmp!="") comments.append(tmp); } if (comments.size()>0) { if (comment=="") comment = "n/a"; comment += " ("; for (int i=0; i<comments.count(); ++i) { if (i>0) { comment += ", "; } comment += comments[i]; } comment += ")"; } //int t_com = timer.elapsed(); //timer.restart(); //genotype counts int allsys_hom_count = 0; int allsys_het_count = 0; int sys_hom_count = 0; int sys_het_count = 0; QSet<int> s_ids_done; int s_id_int = s_id.toInt(); query.exec("SELECT dv.genotype, ps.sample_id FROM detected_variant as dv, processed_sample ps WHERE dv.processed_sample_id=ps.id AND dv.variant_id='" + v_id + "'"); while(query.next()) { //skip this sample id int current_sample = query.value(1).toInt(); if (current_sample==s_id_int) continue; //skip already seen samples (there could be several processings of the same sample because of different processing systems or because of experment repeats due to quality issues) if (s_ids_done.contains(current_sample)) continue; s_ids_done.insert(current_sample); QByteArray current_geno = query.value(0).toByteArray(); if (current_geno=="hom") { ++allsys_hom_count; if (sys_sample_ids.contains(current_sample)) { ++sys_hom_count; } } else if (current_geno=="het") { ++allsys_het_count; if (sys_sample_ids.contains(current_sample)) { ++sys_het_count; } } } //qDebug() << (v.isSNV() ? 
"S" : "I") << query.size() << t_v << t_dv << t_val << t_com << timer.elapsed(); v.annotations()[ihdb_all_hom_idx] = QByteArray::number(allsys_hom_count); v.annotations()[ihdb_all_het_idx] = QByteArray::number(allsys_het_count); if (found_in_db) { v.annotations()[ihdb_hom_idx] = QByteArray::number((double)sys_hom_count / sys_sample_ids.count(), 'f', 4); v.annotations()[ihdb_het_idx] = QByteArray::number((double)sys_het_count / sys_sample_ids.count(), 'f', 4); v.annotations()[ihdb_wt_idx] = QByteArray::number((double)(sys_sample_ids.count() - sys_hom_count - sys_het_count) / sys_sample_ids.count(), 'f', 4); v.annotations()[valid_idx] = val_status; v.annotations()[comment_idx] = comment.replace("\n", " ").replace("\t", " "); } else { v.annotations()[ihdb_hom_idx] = "n/a"; v.annotations()[ihdb_het_idx] = "n/a"; v.annotations()[ihdb_wt_idx] = "n/a"; v.annotations()[valid_idx] = "n/a"; v.annotations()[comment_idx] = "n/a"; } emit updateProgress(100*i/variants.count()); } }