bool NGSD::removeColumnIfPresent(VariantList& variants, QString name, bool exact_name_match) { int index = variants.annotationIndexByName(name, exact_name_match, false); if (index==-1) return false; variants.removeAnnotation(index); return true; }
QString LovdUploadFile::getAnnotation(const VariantList& vl, const Variant& variant, QString key) { return variant.annotations()[vl.annotationIndexByName(key)]; }
void NGSD::annotate(VariantList& variants, QString filename) { initProgress("NGSD annotation", true); //get sample ids QString s_id = sampleId(filename, false); QString ps_id = processedSampleId(filename, false); QString sys_id = getValue("SELECT processing_system_id FROM processed_sample WHERE id='" + processedSampleId(filename, false) + "'").toString(); //check if we could determine the sample bool found_in_db = true; if (s_id=="" || ps_id=="" || sys_id=="") { Log::warn("Could not find processed sample in NGSD by name '" + filename + "'. Annotation will be incomplete because processing system could not be determined!"); found_in_db = false; } //get sample ids that have processed samples with the same processing system (not same sample, variants imported, same processing system, good quality of sample, not tumor) QSet<int> sys_sample_ids; SqlQuery tmp = getQuery(); tmp.exec("SELECT DISTINCT s.id FROM processed_sample as ps, sample s WHERE ps.processing_system_id='" + sys_id + "' AND ps.sample_id=s.id AND s.tumor='0' AND s.quality='good' AND s.id!='" + s_id + "' AND (SELECT count(id) FROM detected_variant as dv WHERE dv.processed_sample_id = ps.id)>0"); while(tmp.next()) { sys_sample_ids.insert(tmp.value(0).toInt()); } //remove all NGSD-specific columns QList<VariantAnnotationHeader> headers = variants.annotations(); foreach(const VariantAnnotationHeader& header, headers) { if (header.name().startsWith("ihdb_")) { removeColumnIfPresent(variants, header.name(), true); } } removeColumnIfPresent(variants, "classification", true); removeColumnIfPresent(variants, "classification_comment", true); removeColumnIfPresent(variants, "validated", true); removeColumnIfPresent(variants, "comment", true); //get required column indices QString num_samples = QString::number(sys_sample_ids.count()); int ihdb_hom_idx = addColumn(variants, "ihdb_hom", "Homozygous variant counts in NGSD for the same processing system (" + num_samples + " samples)."); int ihdb_het_idx = addColumn(variants, "ihdb_het", "Heterozyous variant counts in NGSD for the same processing system (" + num_samples + " samples)."); int ihdb_wt_idx = addColumn(variants, "ihdb_wt", "Wildtype variant counts in NGSD for the same processing system (" + num_samples + " samples)."); int ihdb_all_hom_idx = addColumn(variants, "ihdb_allsys_hom", "Homozygous variant counts in NGSD independent of the processing system."); int ihdb_all_het_idx = addColumn(variants, "ihdb_allsys_het", "Heterozygous variant counts in NGSD independent of the processing system."); int class_idx = addColumn(variants, "classification", "Classification from the NGSD."); int clacom_idx = addColumn(variants, "classification_comment", "Classification comment from the NGSD."); int valid_idx = addColumn(variants, "validated", "Validation information from the NGSD. Validation results of other samples are listed in brackets!"); if (variants.annotationIndexByName("comment", true, false)==-1) addColumn(variants, "comment", "Comments from the NGSD. Comments of other samples are listed in brackets!"); int comment_idx = variants.annotationIndexByName("comment", true, false); //(re-)annotate the variants SqlQuery query = getQuery(); for (int i=0; i<variants.count(); ++i) { //QTime timer; //timer.start(); //variant id Variant& v = variants[i]; QByteArray v_id = variantId(v, false).toLatin1(); //variant classification QVariant classification = getValue("SELECT class FROM variant_classification WHERE variant_id='" + v_id + "'", true); if (!classification.isNull()) { v.annotations()[class_idx] = classification.toByteArray().replace("n/a", ""); v.annotations()[clacom_idx] = getValue("SELECT comment FROM variant_classification WHERE variant_id='" + v_id + "'", true).toByteArray().replace("\n", " ").replace("\t", " "); } //int t_v = timer.elapsed(); //timer.restart(); //detected variant infos int dv_id = -1; QByteArray comment = ""; if (found_in_db) { query.exec("SELECT id, comment FROM detected_variant WHERE processed_sample_id='" + ps_id + "' AND variant_id='" + v_id + "'"); if (query.size()==1) { query.next(); dv_id = query.value(0).toInt(); comment = query.value(1).toByteArray(); } } //validation info int vv_id = -1; QByteArray val_status = ""; if (found_in_db) { query.exec("SELECT id, status FROM variant_validation WHERE sample_id='" + s_id + "' AND variant_id='" + v_id + "'"); if (query.size()==1) { query.next(); vv_id = query.value(0).toInt(); val_status = query.value(1).toByteArray().replace("n/a", ""); } } //int t_dv = timer.elapsed(); //timer.restart(); //validation info other samples int tps = 0; int fps = 0; query.exec("SELECT id, status FROM variant_validation WHERE variant_id='"+v_id+"' AND status!='n/a'"); while(query.next()) { if (query.value(0).toInt()==vv_id) continue; if (query.value(1).toByteArray()=="true positive") ++tps; else if (query.value(1).toByteArray()=="false positive") ++fps; } if (tps>0 || fps>0) { if (val_status=="") val_status = "n/a"; val_status += " (" + QByteArray::number(tps) + "xTP, " + QByteArray::number(fps) + "xFP)"; } //int t_val = timer.elapsed(); //timer.restart(); //comments other samples QList<QByteArray> comments; query.exec("SELECT id, comment FROM detected_variant WHERE variant_id='"+v_id+"' AND comment IS NOT NULL"); while(query.next()) { if (query.value(0).toInt()==dv_id) continue; QByteArray tmp = query.value(1).toByteArray().trimmed(); if (tmp!="") comments.append(tmp); } if (comments.size()>0) { if (comment=="") comment = "n/a"; comment += " ("; for (int i=0; i<comments.count(); ++i) { if (i>0) { comment += ", "; } comment += comments[i]; } comment += ")"; } //int t_com = timer.elapsed(); //timer.restart(); //genotype counts int allsys_hom_count = 0; int allsys_het_count = 0; int sys_hom_count = 0; int sys_het_count = 0; QSet<int> s_ids_done; int s_id_int = s_id.toInt(); query.exec("SELECT dv.genotype, ps.sample_id FROM detected_variant as dv, processed_sample ps WHERE dv.processed_sample_id=ps.id AND dv.variant_id='" + v_id + "'"); while(query.next()) { //skip this sample id int current_sample = query.value(1).toInt(); if (current_sample==s_id_int) continue; //skip already seen samples (there could be several processings of the same sample because of different processing systems or because of experment repeats due to quality issues) if (s_ids_done.contains(current_sample)) continue; s_ids_done.insert(current_sample); QByteArray current_geno = query.value(0).toByteArray(); if (current_geno=="hom") { ++allsys_hom_count; if (sys_sample_ids.contains(current_sample)) { ++sys_hom_count; } } else if (current_geno=="het") { ++allsys_het_count; if (sys_sample_ids.contains(current_sample)) { ++sys_het_count; } } } //qDebug() << (v.isSNV() ? "S" : "I") << query.size() << t_v << t_dv << t_val << t_com << timer.elapsed(); v.annotations()[ihdb_all_hom_idx] = QByteArray::number(allsys_hom_count); v.annotations()[ihdb_all_het_idx] = QByteArray::number(allsys_het_count); if (found_in_db) { v.annotations()[ihdb_hom_idx] = QByteArray::number((double)sys_hom_count / sys_sample_ids.count(), 'f', 4); v.annotations()[ihdb_het_idx] = QByteArray::number((double)sys_het_count / sys_sample_ids.count(), 'f', 4); v.annotations()[ihdb_wt_idx] = QByteArray::number((double)(sys_sample_ids.count() - sys_hom_count - sys_het_count) / sys_sample_ids.count(), 'f', 4); v.annotations()[valid_idx] = val_status; v.annotations()[comment_idx] = comment.replace("\n", " ").replace("\t", " "); } else { v.annotations()[ihdb_hom_idx] = "n/a"; v.annotations()[ihdb_het_idx] = "n/a"; v.annotations()[ihdb_wt_idx] = "n/a"; v.annotations()[valid_idx] = "n/a"; v.annotations()[comment_idx] = "n/a"; } emit updateProgress(100*i/variants.count()); } }