//////////////////////////////////////////////////////////////////////// // Main prefilter function //////////////////////////////////////////////////////////////////////// void Prefilter::prefilter_db(HMM* q_tmp, Hash<Hit>* previous_hits, const int threads, const int prefilter_gap_open, const int prefilter_gap_extend, const int prefilter_score_offset, const int prefilter_bit_factor, const double prefilter_evalue_thresh, const double prefilter_evalue_coarse_thresh, const int preprefilter_smax_thresh, const int min_prefilter_hits, const int maxnumdb, const float R[20][20], std::vector<std::pair<int, std::string> >& new_prefilter_hits, std::vector<std::pair<int, std::string> >& old_prefilter_hits) { Hash<char>* doubled = new Hash<char>; doubled->New(16381, 0); int element_count = (VECSIZE_INT * 4); //W = (LQ+15) / 16; // band width = hochgerundetes LQ/16 int W = (q_tmp->L + (element_count - 1)) / element_count; // query profile (states + 1 because of ANY char) unsigned char* qc = (unsigned char*)malloc_simd_int((hh::NUMCOLSTATES+1)*(q_tmp->L+element_count)*sizeof(unsigned char)); stripe_query_profile(q_tmp, prefilter_score_offset, prefilter_bit_factor, W, qc); simd_int ** workspace = new simd_int *[threads]; std::vector<std::pair<int, int> > first_prefilter; std::vector<std::pair<double, int> > hits; int count_dbs = 0; int gap_init = prefilter_gap_open + prefilter_gap_extend; int gap_extend = prefilter_gap_extend; int LQ = q_tmp->L; const float log_qlen = flog2(LQ); const double factor = (double) num_dbs * LQ; for (int i = 0; i < threads; i++) workspace[i] = (simd_int*) malloc_simd_int( 3 * (LQ + element_count) * sizeof(char)); #pragma omp parallel for schedule(static) // Loop over all database sequences for (size_t n = 0; n < num_dbs; n++) { int thread_id = 0; #ifdef OPENMP thread_id = omp_get_thread_num(); #endif // Perform search step int score = ungapped_sse_score(qc, LQ, first[n], length[n], prefilter_score_offset, workspace[thread_id]); score = score - (int) (prefilter_bit_factor * (log_qlen + flog2(length[n]))); #pragma omp critical first_prefilter.push_back(std::pair<int, int>(score, n)); } //filter after calculation of ungapped sse score to include at least min_prefilter_hits std::vector<std::pair<int, int> >::iterator it; sort(first_prefilter.begin(), first_prefilter.end()); std::reverse(first_prefilter.begin(), first_prefilter.end()); std::vector<std::pair<int, int> >::iterator first_prefilter_begin_erase = first_prefilter.end(); std::vector<std::pair<int, int> >::iterator first_prefilter_end_erase = first_prefilter.end(); count_dbs = 0; for (it = first_prefilter.begin(); it < first_prefilter.end(); it++) { if (count_dbs >= min_prefilter_hits && (*it).first <= preprefilter_smax_thresh) { first_prefilter_begin_erase = it; break; } else { count_dbs++; } } first_prefilter.erase(first_prefilter_begin_erase, first_prefilter_end_erase); HH_LOG(INFO) << "HMMs passed 1st prefilter (gapless profile-profile alignment) : " << count_dbs << std::endl; #pragma omp parallel for schedule(static) // Loop over all database sequences // for (int n = 0; n < count_dbs; n++) { for (size_t i = 0; i < first_prefilter.size(); i++) { int thread_id = 0; #ifdef OPENMP thread_id = omp_get_thread_num(); #endif int n = first_prefilter[i].second; // Perform search step int score = swStripedByte(qc, LQ, first[n], length[n], gap_init, gap_extend, workspace[thread_id], workspace[thread_id] + W, workspace[thread_id] + 2 * W, prefilter_score_offset); double evalue = factor * length[n] * fpow2(-score / prefilter_bit_factor); if (evalue < prefilter_evalue_coarse_thresh) { #pragma omp critical hits.push_back(std::pair<double, int>(evalue, n)); } } //filter after calculation of evalues to include at least min_prefilter_hits sort(hits.begin(), hits.end()); std::vector<std::pair<double, int> >::iterator second_prefilter_begin_erase = hits.end(); std::vector<std::pair<double, int> >::iterator second_prefilter_end_erase = hits.end(); std::vector<std::pair<double, int> >::iterator it2; count_dbs = 0; for (it2 = hits.begin(); it2 < hits.end(); it2++) { if (count_dbs >= min_prefilter_hits && (*it2).first > prefilter_evalue_thresh) { second_prefilter_begin_erase = it2; break; } else { count_dbs++; } } hits.erase(second_prefilter_begin_erase, second_prefilter_end_erase); count_dbs = 0; for (it2 = hits.begin(); it2 < hits.end(); it2++) { // Add hit to dbfiles count_dbs++; char db_name[NAMELEN]; strcpy(db_name, dbnames[(*it2).second]); char name[NAMELEN]; RemoveExtension(name, db_name); if (!doubled->Contains(db_name)) { doubled->Add(db_name); std::pair<int, std::string> result; result.first = length[(*it2).second]; result.second = std::string(db_name); // check, if DB was searched in previous rounds strcat(name, "__1"); // irep=1 if (previous_hits->Contains(name)) { old_prefilter_hits.push_back(result); } else { new_prefilter_hits.push_back(result); } } if (count_dbs >= maxnumdb) { HH_LOG(WARNING) << "Number of hits passing 2nd prefilter (reduced from " << hits.size() << " to allowed maximum of " << maxnumdb << ").\n" <<"You can increase the allowed maximum using the -maxfilt <max> option.\n"; break; } } // Free memory free(qc); for (int i = 0; i < threads; i++) free(workspace[i]); delete[] workspace; if (doubled) delete doubled; }
int main(int argc, char* argv[]) { int i; int nums; FDATA fdata; Hash AddHash; BigFile DestBigFile; BigFile SourceBigFile; BigFileEntry *bfe; int numok,numnew,numupdated; int sourcetype; bool update; char *p1=0; char *p2=0; char *p3=0; int tosspath=0; DataHandle addhandle; int vargc; char **vargv; vargc=argc; vargv=argv; /* handle encrypted file */ kGUIProt DestProt; bool usedestprot; kGUIProt SourceProt; bool usesourceprot; kGUISystemBig sysbig; kGUI::SetSystem(&sysbig); signal(SIGINT, sigint_handler); optcompress=false; optrecursive=true; optverify=false; optdelete=false; optmissing=false; usedestprot=false; usesourceprot=false; #if 0 p1="/source/kgui/_data.big"; p2="/source/kgui/big"; p3="/source/kgui/big/"; optverify=true; #endif for(i=1; i<vargc; ++i) { if(vargv[i][0]=='-') { switch(vargv[i][1]) { case 'c': optcompress=true; break; case 'd': optdelete=true; break; case 'm': optmissing=true; /* delete missing files from subdir */ break; case 'v': optverify=true; break; case 'r': optrecursive=false; break; case 'k': if(vargv[i][2]=='d') /* encryption key on destination file */ { printf("using dest key\n"); if(DestProt.SetKey(vargv[i+1],atoi(vargv[i+2]),atoi(vargv[i+3]),true)==false) { printf("Error loading dest keyfile '%s'\n",vargv[i+1]); return(0); } usedestprot=true; i+=3; } else if(vargv[i][2]=='s') /* encryption key on source file */ { printf("using source key\n"); if(SourceProt.SetKey(vargv[i+1],atoi(vargv[i+2]),atoi(vargv[i+3]),true)==false) { printf("Error loading source keyfile '%s'\n",vargv[i+1]); return(0); } usesourceprot=true; i+=3; } optrecursive=false; break; default: printf("Unknown parm '%s'\n",vargv[i]); return(0); break; } } else { if(!p1) p1=vargv[i]; else if(!p2) p2=vargv[i]; else if(!p3) { p3=vargv[i]; tosspath=(int)strlen(p3); } else { printf("Unknown parm '%s'\n",vargv[i]); return(0); } } } /* need at least 1 parm */ if(!p1) { printf("kguibig: (c) 2005 Kevin Pickelll\n"); printf("usage: kguibig bigfile.big path [root]\n"); printf(" -c = compress\n"); printf(" -v = verify\n"); printf(" -r = don't recurse\n"); printf(" -k[d,s] = filename offset len // source/dest key\n"); return(0); } DestBigFile.SetFilename(p1); if(usedestprot==true) DestBigFile.SetEncryption(&DestProt); DestBigFile.Load(true); if(DestBigFile.IsBad()==true) { printf("Dest file exists and is not a bigfile, or decryption key is incorrect!\n"); return(0); } /* list, verify or compress ? */ if(!p2) { if(optcompress) { } else { unsigned long crc; unsigned long vfsize; BigFileEntry *sfe; unsigned char copybuf[65536]; DataHandle checkhandle; /* verify or list */ nums=DestBigFile.GetNumEntries(); for(i=0; ((i<nums) && (g_userabort==false)); ++i) { sfe=(BigFileEntry *)DestBigFile.GetEntry(i); if(optverify) { /* check crc and print if no match */ printf("%d%c",i,13); vfsize=sfe->m_size; crc=0; DestBigFile.CopyArea(&checkhandle,sfe->m_offset,sfe->m_size,sfe->m_time); checkhandle.Open(); while(vfsize>sizeof(copybuf)) { checkhandle.Read(©buf,(unsigned long)sizeof(copybuf)); crc=DestBigFile.CrcBuffer(crc,copybuf,sizeof(copybuf)); vfsize-=sizeof(copybuf); }; /* write remainder */ if(vfsize>0) { checkhandle.Read(©buf,vfsize); crc=DestBigFile.CrcBuffer(crc,copybuf,vfsize); } checkhandle.Close(); if(crc!=sfe->m_crc) printf("CRC Error on file '%s' %06x<>%06x\n",sfe->GetName()->GetString(),(int)crc,sfe->m_crc); } else /* assume list if verify is not set */ printf("%s, len=%d,crc=%06x\n",sfe->GetName()->GetString(),sfe->m_size,sfe->m_crc); } } return(0); } AddHash.Init(16,sizeof(FDATA)); /* is p2 a bigfile? */ if(strstr(p2,".big")) { SourceBigFile.SetFilename(p2); if(usesourceprot==true) SourceBigFile.SetEncryption(&SourceProt); SourceBigFile.Load(true); if(SourceBigFile.IsBad()==false) sourcetype=SOURCE_BIG; else { printf("Source file exists and is not a bigfile, or decryption key is incorrect!\n"); return(0); } } else if(kGUI::IsDir(p2)==false) { fdata.time=kGUI::SysFileTime(p2); //fdata.root=p3; AddHash.Add(p2,&fdata); sourcetype=SOURCE_FILE; } else { unsigned int df; const char *name; kGUIDir dir; printf("loading directory!\n"); dir.LoadDir(p2,true,true); for(df=0; df<dir.GetNumFiles(); ++df) { name=dir.GetFilename(df); fdata.time=kGUI::SysFileTime(name); AddHash.Add(name,&fdata); } // scandir(&AddHash,p2); sourcetype=SOURCE_DIR; } /* now, look for differences between bigfile and files in the addhash list */ numok=0; numnew=0; numupdated=0; /* todo: optdelete function */ /* add from source bigfile to dest bigfile will not work */ /* if source is encrypted so I need to rewrite addfile to use a datahandle */ /* instead of a filestream */ if(sourcetype==SOURCE_BIG) { BigFileEntry *sfe; nums=SourceBigFile.GetNumEntries(); for(i=0; ((i<nums) && (g_userabort==false)); ++i) { sfe=(BigFileEntry *)SourceBigFile.GetEntry(i); bfe=DestBigFile.Locate(sfe->GetName()->GetString()); if(!bfe) { // printf("File '%s' not in destination set!\n",she->m_string); update=true; ++numnew; } else { if(sfe->m_time==bfe->m_time) { update=false; ++numok; } else { int deltatime; deltatime=abs(sfe->m_time-bfe->m_time); if(deltatime==46400 || deltatime==3600) { update=false; ++numok; } else { printf("File '%s' %d,%d times are different!(%d)\n",sfe->GetName()->GetString(),sfe->m_time,bfe->m_time,deltatime); ++numupdated; update=true; } } } if(update==true) { SourceBigFile.CopyArea(&addhandle,sfe->m_offset,sfe->m_size,sfe->m_time); // addsize=sfe->m_size; // addtime=sfe->m_time; // fseek(addhandle,sfe->m_offset,SEEK_SET); /* add the file to the bigfile */ DestBigFile.AddFile(sfe->GetName()->GetString(),&addhandle,false); } } DestBigFile.UpdateDir(); } else { HashEntry *she; FDATA *sfdata; nums=AddHash.GetNum(); she=AddHash.GetFirst(); for(i=0; ((i<nums) && (g_userabort==false)); ++i) { sfdata=(FDATA *)she->m_data; bfe=DestBigFile.Locate(she->m_string+tosspath); if(!bfe) { // printf("File '%s' not in destination set!\n",she->m_string+tosspath); ++numnew; update=true; } else { if(sfdata->time==bfe->m_time) { update=false; ++numok; } else { int deltatime; deltatime=(abs((int)sfdata->time-bfe->m_time)); if(deltatime==46400 || deltatime==3600) { update=false; ++numok; } else { printf("File '%s' %d,%d times are different!(%d)\n",she->m_string+tosspath,(int)sfdata->time,bfe->m_time,deltatime); ++numupdated; update=true; } } } if(update==true) { addhandle.SetFilename(she->m_string); /* add the file to the bigfile */ DestBigFile.AddFile(she->m_string+tosspath,&addhandle,false); } she=she->m_next; } DestBigFile.UpdateDir(); } printf("numok=%d,numnew=%d,numupdated=%d\n",numok,numnew,numupdated); return 0; }