Exemplo n.º 1
0
////////////////////////////////////////////////////////////////////////
// Main prefilter function
////////////////////////////////////////////////////////////////////////
  void Prefilter::prefilter_db(HMM* q_tmp, Hash<Hit>* previous_hits,
      const int threads, const int prefilter_gap_open,
      const int prefilter_gap_extend, const int prefilter_score_offset,
      const int prefilter_bit_factor, const double prefilter_evalue_thresh,
      const double prefilter_evalue_coarse_thresh,
      const int preprefilter_smax_thresh, const int min_prefilter_hits, const int maxnumdb,
      const float R[20][20],
      std::vector<std::pair<int, std::string> >& new_prefilter_hits,
      std::vector<std::pair<int, std::string> >& old_prefilter_hits) {

    Hash<char>* doubled = new Hash<char>;
    doubled->New(16381, 0);

    int element_count = (VECSIZE_INT * 4);
    //W = (LQ+15) / 16;   // band width = hochgerundetes LQ/16
    int W = (q_tmp->L + (element_count - 1)) / element_count;
    // query profile (states + 1 because of ANY char)
    unsigned char* qc = (unsigned char*)malloc_simd_int((hh::NUMCOLSTATES+1)*(q_tmp->L+element_count)*sizeof(unsigned char));
    stripe_query_profile(q_tmp, prefilter_score_offset, prefilter_bit_factor, W, qc);

    simd_int ** workspace = new simd_int *[threads];

    std::vector<std::pair<int, int> > first_prefilter;
    std::vector<std::pair<double, int> > hits;

    int count_dbs = 0;
    int gap_init = prefilter_gap_open + prefilter_gap_extend;
    int gap_extend = prefilter_gap_extend;
    int LQ = q_tmp->L;
    const float log_qlen = flog2(LQ);
    const double factor = (double) num_dbs * LQ;

    for (int i = 0; i < threads; i++)
      workspace[i] = (simd_int*) malloc_simd_int(
          3 * (LQ + element_count) * sizeof(char));

#pragma omp parallel for schedule(static)
    // Loop over all database sequences
    for (size_t n = 0; n < num_dbs; n++) {
      int thread_id = 0;
#ifdef OPENMP
      thread_id = omp_get_thread_num();
#endif
      // Perform search step
      int score = ungapped_sse_score(qc, LQ, first[n], length[n],
          prefilter_score_offset, workspace[thread_id]);

      score = score
          - (int) (prefilter_bit_factor * (log_qlen + flog2(length[n])));

#pragma omp critical
      first_prefilter.push_back(std::pair<int, int>(score, n));
    }
    //filter after calculation of ungapped sse score to include at least min_prefilter_hits
    std::vector<std::pair<int, int> >::iterator it;

    sort(first_prefilter.begin(), first_prefilter.end());
    std::reverse(first_prefilter.begin(), first_prefilter.end());

    std::vector<std::pair<int, int> >::iterator first_prefilter_begin_erase =
        first_prefilter.end();
    std::vector<std::pair<int, int> >::iterator first_prefilter_end_erase =
        first_prefilter.end();
    count_dbs = 0;
    for (it = first_prefilter.begin(); it < first_prefilter.end(); it++) {
      if (count_dbs >= min_prefilter_hits
          && (*it).first <= preprefilter_smax_thresh) {
        first_prefilter_begin_erase = it;
        break;
      }
      else {
        count_dbs++;
      }
    }

    first_prefilter.erase(first_prefilter_begin_erase,
        first_prefilter_end_erase);

    HH_LOG(INFO)
        << "HMMs passed 1st prefilter (gapless profile-profile alignment)  : "
        << count_dbs << std::endl;

#pragma omp parallel for schedule(static)
    // Loop over all database sequences
//  for (int n = 0; n < count_dbs; n++) {
    for (size_t i = 0; i < first_prefilter.size(); i++) {
      int thread_id = 0;
#ifdef OPENMP
      thread_id = omp_get_thread_num();
#endif

      int n = first_prefilter[i].second;

      // Perform search step
      int score = swStripedByte(qc, LQ, first[n], length[n], gap_init,
          gap_extend, workspace[thread_id], workspace[thread_id] + W,
          workspace[thread_id] + 2 * W, prefilter_score_offset);

      double evalue = factor * length[n] * fpow2(-score / prefilter_bit_factor);

      if (evalue < prefilter_evalue_coarse_thresh) {
#pragma omp critical
        hits.push_back(std::pair<double, int>(evalue, n));
      }
    }

    //filter after calculation of evalues to include at least min_prefilter_hits
    sort(hits.begin(), hits.end());

    std::vector<std::pair<double, int> >::iterator second_prefilter_begin_erase =
        hits.end();
    std::vector<std::pair<double, int> >::iterator second_prefilter_end_erase =
        hits.end();
    std::vector<std::pair<double, int> >::iterator it2;

    count_dbs = 0;
    for (it2 = hits.begin(); it2 < hits.end(); it2++) {
      if (count_dbs >= min_prefilter_hits
          && (*it2).first > prefilter_evalue_thresh) {
        second_prefilter_begin_erase = it2;
        break;
      }
      else {
        count_dbs++;
      }
    }

    hits.erase(second_prefilter_begin_erase, second_prefilter_end_erase);

    count_dbs = 0;

    for (it2 = hits.begin(); it2 < hits.end(); it2++) {
      // Add hit to dbfiles
      count_dbs++;
      char db_name[NAMELEN];
      strcpy(db_name, dbnames[(*it2).second]);

      char name[NAMELEN];
      RemoveExtension(name, db_name);

      if (!doubled->Contains(db_name)) {
        doubled->Add(db_name);

        std::pair<int, std::string> result;
        result.first = length[(*it2).second];
        result.second = std::string(db_name);

        // check, if DB was searched in previous rounds
        strcat(name, "__1");  // irep=1

        if (previous_hits->Contains(name)) {
          old_prefilter_hits.push_back(result);
        }
        else {
          new_prefilter_hits.push_back(result);
        }
      }
      if (count_dbs >= maxnumdb)
      {
        HH_LOG(WARNING)
        << "Number of hits passing 2nd prefilter (reduced from " << hits.size() << " to allowed maximum of " << maxnumdb << ").\n"
        <<"You can increase the allowed maximum using the -maxfilt <max> option.\n";
        break;
      }
    }


      
      
    // Free memory
    free(qc);
    for (int i = 0; i < threads; i++)
      free(workspace[i]);
    delete[] workspace;
    if (doubled)
      delete doubled;
  }
Exemplo n.º 2
0
int main(int argc, char* argv[])
{
    int i;
    int nums;
    FDATA fdata;
    Hash AddHash;
    BigFile DestBigFile;
    BigFile SourceBigFile;
    BigFileEntry *bfe;
    int numok,numnew,numupdated;
    int sourcetype;
    bool update;
    char *p1=0;
    char *p2=0;
    char *p3=0;
    int tosspath=0;
    DataHandle addhandle;
    int vargc;
    char **vargv;
    vargc=argc;
    vargv=argv;

    /* handle encrypted file */
    kGUIProt DestProt;
    bool usedestprot;
    kGUIProt SourceProt;
    bool usesourceprot;
    kGUISystemBig sysbig;

    kGUI::SetSystem(&sysbig);
    signal(SIGINT, sigint_handler);
    optcompress=false;
    optrecursive=true;
    optverify=false;
    optdelete=false;
    optmissing=false;
    usedestprot=false;
    usesourceprot=false;

#if 0
    p1="/source/kgui/_data.big";
    p2="/source/kgui/big";
    p3="/source/kgui/big/";
    optverify=true;
#endif

    for(i=1; i<vargc; ++i)
    {
        if(vargv[i][0]=='-')
        {
            switch(vargv[i][1])
            {
            case 'c':
                optcompress=true;
                break;
            case 'd':
                optdelete=true;
                break;
            case 'm':
                optmissing=true;	/* delete missing files from subdir */
                break;
            case 'v':
                optverify=true;
                break;
            case 'r':
                optrecursive=false;
                break;
            case 'k':
                if(vargv[i][2]=='d')	/* encryption key on destination file */
                {
                    printf("using dest key\n");
                    if(DestProt.SetKey(vargv[i+1],atoi(vargv[i+2]),atoi(vargv[i+3]),true)==false)
                    {
                        printf("Error loading dest keyfile '%s'\n",vargv[i+1]);
                        return(0);
                    }
                    usedestprot=true;
                    i+=3;
                }
                else if(vargv[i][2]=='s') /* encryption key on source file */
                {
                    printf("using source key\n");
                    if(SourceProt.SetKey(vargv[i+1],atoi(vargv[i+2]),atoi(vargv[i+3]),true)==false)
                    {
                        printf("Error loading source keyfile '%s'\n",vargv[i+1]);
                        return(0);
                    }
                    usesourceprot=true;
                    i+=3;
                }
                optrecursive=false;
                break;
            default:
                printf("Unknown parm '%s'\n",vargv[i]);
                return(0);
                break;
            }
        }
        else
        {
            if(!p1)
                p1=vargv[i];
            else if(!p2)
                p2=vargv[i];
            else if(!p3)
            {
                p3=vargv[i];
                tosspath=(int)strlen(p3);
            }
            else
            {
                printf("Unknown parm '%s'\n",vargv[i]);
                return(0);
            }
        }
    }

    /* need at least 1 parm */
    if(!p1)
    {
        printf("kguibig: (c) 2005 Kevin Pickelll\n");
        printf("usage: kguibig bigfile.big path [root]\n");
        printf(" -c = compress\n");
        printf(" -v = verify\n");
        printf(" -r = don't recurse\n");
        printf(" -k[d,s] = filename offset len // source/dest key\n");
        return(0);
    }

    DestBigFile.SetFilename(p1);
    if(usedestprot==true)
        DestBigFile.SetEncryption(&DestProt);
    DestBigFile.Load(true);
    if(DestBigFile.IsBad()==true)
    {
        printf("Dest file exists and is not a bigfile, or decryption key is incorrect!\n");
        return(0);
    }

    /* list, verify or compress ? */
    if(!p2)
    {
        if(optcompress)
        {
        }
        else
        {
            unsigned long crc;
            unsigned long vfsize;
            BigFileEntry *sfe;
            unsigned char copybuf[65536];
            DataHandle checkhandle;

            /* verify or list */
            nums=DestBigFile.GetNumEntries();
            for(i=0; ((i<nums) && (g_userabort==false)); ++i)
            {
                sfe=(BigFileEntry *)DestBigFile.GetEntry(i);

                if(optverify)
                {
                    /* check crc and print if no match */
                    printf("%d%c",i,13);
                    vfsize=sfe->m_size;

                    crc=0;
                    DestBigFile.CopyArea(&checkhandle,sfe->m_offset,sfe->m_size,sfe->m_time);
                    checkhandle.Open();
                    while(vfsize>sizeof(copybuf))
                    {
                        checkhandle.Read(&copybuf,(unsigned long)sizeof(copybuf));
                        crc=DestBigFile.CrcBuffer(crc,copybuf,sizeof(copybuf));
                        vfsize-=sizeof(copybuf);
                    };
                    /* write remainder */
                    if(vfsize>0)
                    {
                        checkhandle.Read(&copybuf,vfsize);
                        crc=DestBigFile.CrcBuffer(crc,copybuf,vfsize);
                    }
                    checkhandle.Close();
                    if(crc!=sfe->m_crc)
                        printf("CRC Error on file '%s' %06x<>%06x\n",sfe->GetName()->GetString(),(int)crc,sfe->m_crc);
                }
                else	/* assume list if verify is not set */
                    printf("%s, len=%d,crc=%06x\n",sfe->GetName()->GetString(),sfe->m_size,sfe->m_crc);
            }
        }
        return(0);
    }

    AddHash.Init(16,sizeof(FDATA));

    /* is p2 a bigfile? */
    if(strstr(p2,".big"))
    {

        SourceBigFile.SetFilename(p2);
        if(usesourceprot==true)
            SourceBigFile.SetEncryption(&SourceProt);
        SourceBigFile.Load(true);
        if(SourceBigFile.IsBad()==false)
            sourcetype=SOURCE_BIG;
        else
        {
            printf("Source file exists and is not a bigfile, or decryption key is incorrect!\n");
            return(0);
        }
    }
    else if(kGUI::IsDir(p2)==false)
    {
        fdata.time=kGUI::SysFileTime(p2);
        //fdata.root=p3;
        AddHash.Add(p2,&fdata);
        sourcetype=SOURCE_FILE;
    }
    else
    {
        unsigned int df;

        const char *name;
        kGUIDir dir;

        printf("loading directory!\n");
        dir.LoadDir(p2,true,true);
        for(df=0; df<dir.GetNumFiles(); ++df)
        {
            name=dir.GetFilename(df);
            fdata.time=kGUI::SysFileTime(name);
            AddHash.Add(name,&fdata);
        }
//		scandir(&AddHash,p2);
        sourcetype=SOURCE_DIR;
    }
    /* now, look for differences between bigfile and files in the addhash list */

    numok=0;
    numnew=0;
    numupdated=0;

    /* todo: optdelete function */

    /* add from source bigfile to dest bigfile will not work */
    /* if source is encrypted so I need to rewrite addfile to use a datahandle */
    /* instead of a filestream */

    if(sourcetype==SOURCE_BIG)
    {
        BigFileEntry *sfe;

        nums=SourceBigFile.GetNumEntries();
        for(i=0; ((i<nums) && (g_userabort==false)); ++i)
        {
            sfe=(BigFileEntry *)SourceBigFile.GetEntry(i);
            bfe=DestBigFile.Locate(sfe->GetName()->GetString());
            if(!bfe)
            {
//				printf("File '%s' not in destination set!\n",she->m_string);
                update=true;
                ++numnew;
            }
            else
            {
                if(sfe->m_time==bfe->m_time)
                {
                    update=false;
                    ++numok;
                }
                else
                {
                    int deltatime;

                    deltatime=abs(sfe->m_time-bfe->m_time);
                    if(deltatime==46400 || deltatime==3600)
                    {
                        update=false;
                        ++numok;
                    }
                    else
                    {
                        printf("File '%s' %d,%d times are different!(%d)\n",sfe->GetName()->GetString(),sfe->m_time,bfe->m_time,deltatime);
                        ++numupdated;
                        update=true;
                    }
                }
            }

            if(update==true)
            {
                SourceBigFile.CopyArea(&addhandle,sfe->m_offset,sfe->m_size,sfe->m_time);
//				addsize=sfe->m_size;
//				addtime=sfe->m_time;
//				fseek(addhandle,sfe->m_offset,SEEK_SET);

                /* add the file to the bigfile */
                DestBigFile.AddFile(sfe->GetName()->GetString(),&addhandle,false);
            }
        }
        DestBigFile.UpdateDir();
    }
    else
    {
        HashEntry *she;
        FDATA *sfdata;

        nums=AddHash.GetNum();
        she=AddHash.GetFirst();
        for(i=0; ((i<nums) && (g_userabort==false)); ++i)
        {
            sfdata=(FDATA *)she->m_data;

            bfe=DestBigFile.Locate(she->m_string+tosspath);
            if(!bfe)
            {
//				printf("File '%s' not in destination set!\n",she->m_string+tosspath);
                ++numnew;
                update=true;
            }
            else
            {
                if(sfdata->time==bfe->m_time)
                {
                    update=false;
                    ++numok;
                }
                else
                {
                    int deltatime;

                    deltatime=(abs((int)sfdata->time-bfe->m_time));
                    if(deltatime==46400 || deltatime==3600)
                    {
                        update=false;
                        ++numok;
                    }
                    else
                    {
                        printf("File '%s' %d,%d times are different!(%d)\n",she->m_string+tosspath,(int)sfdata->time,bfe->m_time,deltatime);
                        ++numupdated;
                        update=true;
                    }
                }
            }
            if(update==true)
            {
                addhandle.SetFilename(she->m_string);
                /* add the file to the bigfile */
                DestBigFile.AddFile(she->m_string+tosspath,&addhandle,false);
            }

            she=she->m_next;
        }
        DestBigFile.UpdateDir();
    }
    printf("numok=%d,numnew=%d,numupdated=%d\n",numok,numnew,numupdated);

    return 0;
}