bool GStore::vardb_load_vcf( const std::string & file, const std::string & tag , const std::string & comment, Mask & mask , const std::set<std::string> & includes, const std::set<std::string> & excludes, const std::set<Region> * pfilter ) { // Load, parse VCF file; store variant and genotype information, and // meta-information, in vardb File * f = fIndex.file( file ); if ( f == NULL ) return false; VCFReader v( f , tag , &vardb , &seqdb ); // Selectively filter in/out meta-information? if ( includes.size() > 0 ) v.get_meta( includes ); if ( excludes.size() > 0 ) v.ignore_meta( excludes ); // add a region filter? if ( pfilter ) v.set_region_mask( pfilter ); // If SEQDB has been disabled if ( ! GP->seqdb.attached() ) v.set_seqdb( NULL ); // Do we want to apply up-front fix for X/Y genotypes? if ( mask.fixxy() ) { v.set_fixxy( &mask , &locdb, &inddb ); } vardb.begin(); int inserted = 0; plog.counter1( "parsing..." ); while ( v.parseLine() ) { if ( ++inserted % 1000 == 0 ) plog.counter1( "parsed " + Helper::int2str( inserted ) + " rows" ); } plog.counter1("\n"); // Wrap up vardb.commit(); return true; }
// Iterate over the variants of an external VCF file, applying the mask
// to each one and invoking a callback on those that pass.
//
//   f     callback handed to eval_and_call() for each variant
//   data  opaque pointer forwarded to the callback
//   mask  supplies the VCF file name ('ex-vcf' attribute), the region /
//         ID filters, the X/Y fix-up and downcode settings, and the
//         grouping / variant-limit parameters for the report
//
// Returns an IterationReport on which accepted_variant() /
// rejected_variant() has been called for each variant considered.
// Iteration stops early as soon as accepted_variant() returns false.
IterationReport VarDBase::vcf_iterate( void (*f)(Variant&, void *) ,
                                       void * data ,
                                       Mask & mask )
{

  IterationReport irep( true , mask.any_grouping() , mask.variant_limit() );

  // VCF file name is kept in the Mask, by the 'ex-vcf' attribute

  std::string filename = mask.external_vcf_filename();

  // "-" is special-cased: no file-existence check is made for it
  // (presumably it denotes a stream rather than a file on disk -- confirm)

  if ( filename != "-" )
    Helper::checkFileExists( filename );

  // Use VCFReader, into a temporary :memory: database

  // Load, parse VCF file; store variant and genotype information, and
  // meta-information, in vardb

  File vcffile( filename , VCF );

  VCFReader v( &vcffile , "" , &(GP->vardb) , NULL );

  // hack: for "-", mark the header as already observed and take the
  // number of individuals from the existing individual map

  if ( filename == "-" )
    {
      v.observed_header( true );
      v.set_number_individuals( GP->indmap.size() );
    }

  if ( mask.fixxy() )
    v.set_fixxy( &mask , &(GP->locdb), &(GP->inddb) );

  // Selectively filter in/out meta-information?
  // or, add a region filter?

  // We might not want to load the entire VCF into memory; & thus allow
  // the includes/excludes and filters to bring regions/tags into view

  // Respect 'reg', and 'loc' from command line.
  // But not loc.subset; ereg, loc.req, loc.ex, etc

  std::set<Region> filter;

  std::string locinc = mask.loc_include_string();

  int l2 = 0;

  if ( locinc != "" && locinc != "." )
    {
      if ( ! Helper::str2int( locinc , l2 ) )
        Helper::halt("could not parse 'loc' mask");

      if ( l2 > 0 )
        filter = GP->locdb.get_regions( l2 );
    }

  // Merge the explicitly included regions into the filter

  const std::set<Region> & reginc = mask.included_reg();

  filter.insert( reginc.begin() , reginc.end() );

  // Add other "reg" from mask?

  if ( ! filter.empty() )
    v.set_region_mask( &filter );

  if ( mask.id() )
    v.add_id_filter( mask.included_id() );

  // Misc. settings.

  downcode_mode = mask.downcode();

  // Work through VCF

  GP->vardb.begin();

  v.return_variant( true );

  Variant * pv = NULL;

  while ( 1 )
    {

      VCFReader::line_t l = v.parseLine( &pv );

      if ( l == VCFReader::VCF_EOF )
        break;

      if ( l == VCFReader::VCF_INVALID )
        continue;

      // If a variant line has been processed and meets criteria, pv
      // will be non-NULL which also implies that a Variant has been
      // created and we are responsible for cleaning up afterwards

      if ( l != VCFReader::VCF_VARIANT || pv == NULL )
        continue;

      bool stop = false;

      if ( ! pv->valid() )
        {
          // bad line, or failed a loc mask filter
          irep.rejected_variant();
        }
      else
        {
          // So that the Variant functions know not to look for data
          // in a BLOB; also, so they know how to parse it downstream

          pv->set_vcf_buffer( v.gt_field , &v.formats );

          // Apply all mask filters, and decide whether to call function

          if ( eval_and_call( mask, &(GP->indmap) , *pv , f , data ) )
            {
              // accepted_variant() returning false ends the iteration
              if ( ! irep.accepted_variant() )
                stop = true;
            }
          else
            irep.rejected_variant();
        }

      // and now clean up -- on every path, including the early stop,
      // so the Variant is never leaked; reset the pointer so a stale
      // address can never be seen (or deleted twice) on a later pass

      delete pv;
      pv = NULL;

      if ( stop )
        break;

    }

  // Wrap up

  GP->vardb.commit();

  // NOTE: an earlier comment here described running a second iterate
  // on the temporary database when positional filters or grouping are
  // in use; no such pass is performed in this function -- the report
  // is returned as-is.

  return irep;
}