Beispiel #1
0
/*
 * Two-pass scoring of a co-occurrence vector.
 *
 * First pass : sorts *cooc_vec_ptr with sort_by_key(), then merges the partial
 *              sorts in an OpenMP parallel region, calling score_one_source()
 *              with an aggregation function to fill a temporary result vector.
 * Hand-over  : the vector originally pointed to by *cooc_vec_ptr is deleted and
 *              *cooc_vec_ptr is replaced by a freshly allocated vector.
 * Second pass: sorts the temporary result vector, optionally loads or creates
 *              an n-gram list, then merges in parallel into the new
 *              *cooc_vec_ptr while accumulating the counters big_n1..big_n4
 *              and the number of n-gram matches.
 *
 * Parameters:
 *   cooc_vec_ptr - in/out; the pointee is consumed (deleted) and replaced by
 *                  the vector holding the second-pass output.
 *   g_scores     - optional output; when non-NULL it receives a copy of the
 *                  locally collected global_score_type.
 *   mem          - forwarded to sort_by_key() and sort_unique_ngrams()
 *                  (presumably a memory budget for sorting - TODO confirm).
 *   lagg         - selects the first-pass aggregation function:
 *                  OCC_AGG -> max_occur, AVG_AGG -> average,
 *                  anything else -> max_double.
 *   testset      - when non-NULL and ngramsfname is NULL, n-grams are created
 *                  from it with pget_ngrams(); whenever it is non-NULL the
 *                  sorted n-gram list is also saved to a ".ngrams" file whose
 *                  name is derived from testset via rsplit(testset,PATH_SEP).
 *   ngramsfname  - when non-NULL, n-grams are loaded from it with
 *                  pload_ngrams(); takes precedence over testset for loading.
 *   pairs2keep   - forwarded unchanged to the second-pass score_one_source().
 *
 * Progress and timing information is written to cout; warnings go to cerr.
 */
void score(pcooc_vector_t ** cooc_vec_ptr,/*pcooc_vector_t * res_vec,*/global_score_type * g_scores,unsigned long mem,unsigned lagg,
           const char * testset,const char * ngramsfname,pindicator_vector_t * pairs2keep )
{
    double phase1span,phase2span;
    global_score_type gs;
   
//    cout<<"Scoring started..."<<endl;
    Crono timer;
    timer.start();
    
    pcooc_vector_t * cooc_vec=*cooc_vec_ptr;
    // Temporary output of the first pass, with as many sub-vectors as the input.
    pcooc_vector_t * res_vec=new pcooc_vector_t(cooc_vec->nb_sub_vecs());
    cooc_vec->print_sizes();
	// Record the input size before the first pass aggregates anything.
	gs.all_occurs=cooc_vec->size();
    cout<<endl;
    cout<<"-------------------------\n"
	      "Sorting (first run)\n"
		  "-------------------------"<<endl;
    sort_by_key (cooc_vec,mem);
    timer.stop(); 

   // At this point phase1span only covers the partial sorting.
   phase1span=timer.span();
    cout<<"\nDone sorting partial sets\nPartial sorting took "<<timer.formatted_span()<<endl;
    
    cout<<"\n-------------------------\n"
	        "Merging partial sorts\n"
			"-------------------------"<<endl;
    timer.reset();
    timer.start();

	// Default aggregation is max_double; lagg may override it below.
	aggregation_func selector=&max_double;
	switch(lagg){
		case OCC_AGG: selector=&max_occur;break;
		case AVG_AGG: selector=&average;break;			
			
	}
    
    Merger<cooc_type> * /*pcooc_vector_t::merge_iterator*/ g_merger=new Merger<cooc_type>(cooc_vec);//->get_merge_iterator();
     
    // First-pass merge: every thread initialises the shared merger with its
    // thread number, waits at the barrier, then keeps calling
    // score_one_source() until the merger evaluates to false.
    #pragma omp parallel shared(cooc_vec,res_vec, g_merger,selector)
    {
        g_merger->init(omp_get_thread_num());
        #pragma omp barrier
        
         while(true){

            // Called at least once per thread; the merger is tested afterwards.
            score_one_source(res_vec,g_merger,selector);
            if(!(*g_merger))
                break;
//            cout<<omp_get_thread_num()<<": "<</*g_merger.count()<<*/"\t";
        }
    }


    timer.stop();

    cout<<"Done merging in: "<<timer.formatted_span()<<endl;

    // NOTE(review): phase1span still holds only the sorting time here (the merge
    // time is added further down). Whether the figure printed below includes the
    // merge depends on what Crono::formatted_span(double) does - confirm.
    cout<<//"\n-------------------------------------------------------\n"<<
            "\nFirst run of scoring done \nFirst run, including partial sorting, took "<<
            timer.formatted_span(phase1span)<<endl;

    // The input vector is no longer needed: free it and hand the caller a new,
    // empty vector that the second pass will fill.
    delete g_merger;
    delete *cooc_vec_ptr;
//    cooc_vec->clear();
	//second sort by target sentences IDs
    *cooc_vec_ptr=new pcooc_vector_t(res_vec->nb_sub_vecs());
    cooc_vec=*cooc_vec_ptr;

    // phase1span now covers sorting plus merging of the first pass.
    phase1span+=timer.span();
    timer.reset();
    timer.start();
    cout<<"\nNumber of unique pairs: "<<res_vec->size()<<endl;
    res_vec->print_sizes();
    cout<<endl;
    cout<<"---------------------------\nResorting (second run)\n---------------------------"<<endl;
    
    sort_by_key (res_vec,mem);
    
    timer.stop();
    // phase2span covers only the second partial sorting; the n-gram handling
    // below is timed separately and is not added to it.
    phase2span=timer.span();
    cout<<"\nDone sorting partial sets\nPartial sorting took "<<timer.formatted_span()<<endl;
/*+++++++++++++++++++++++++++Allow subsampling at this stage++++++++++++++++++++++++++++++++++*/    
    // ngrams stays NULL unless a test set or an n-gram file name was supplied.
    pngram_vector_t * ngrams=NULL;
    if(testset != NULL || ngramsfname!= NULL){    
        ngrams= new pngram_vector_t(NB_WORKERS);
		timer.reset();
        timer.start();
        
        // An explicit n-gram file takes precedence over building from the test set.
        if(ngramsfname!= NULL){
            cout<<"\n########################\n"
			        "Loading the n-gram list\n"
					"########################\n"<<endl;
            pload_ngrams(ngramsfname, ngrams);
            cout<<"Done loading n-grams!\n"<<"n-gram list loaded in ";
        }else{
            cout<<"\n########################\n"
			        "Creating the n-gram list\n"
					"########################\n"<<endl;
            pget_ngrams(testset, ngrams);
            cout<<"Done creating n-grams!\n"<<"n-gram list created in ";
        }
        timer.stop();        
        cout<<timer.formatted_span()<<endl;
        timer.reset();
        cout<<"\n------------------------\n"
		        "Sorting the n-gram list\n"
				"------------------------\n"<<endl;        
        timer.start();
        sort_unique_ngrams(ngrams,mem);
        timer.stop();
        cout<<"Sorting n-grams took "<<timer.formatted_span()<<endl;
        
        
        
        // The list is saved whenever testset is given, even if the n-grams were
        // loaded from ngramsfname rather than created from the test set.
        if(testset){
            cout<<"\n------------------------\n"
			        "Saving the n-gram list\n"
					"------------------------\n"<<endl;
            timer.reset();
            timer.start();
            psave_ngrams(rsplit(testset,PATH_SEP).second.append(".ngrams").c_str(),ngrams);
            timer.stop();
            cout<<"Saving n-grams took "<<timer.formatted_span()<<endl;
        }
            
        
    }
    
 /*++++++++++++++++++++++++++END of ngram creation/loading+++++++++++++++++++++++++++++++++++++*/   
    if(!ngrams)
        cout<<"\n-------------------------\n"
		        "Merging partial sorts\n"
				"-------------------------"<<endl;
    else
        cout<<"\n---------------------------------------\n"
                "Merging and sub-sampling partial sorts\n"
                "---------------------------------------"<<endl;
    
    timer.reset();
    timer.start();
    // The n-gram merger exists only when an n-gram list is available; a NULL
    // pointer is passed to score_one_source() otherwise.
    Merger<ngram_t> * /*pcooc_vector_t::merge_iterator*/ g_ngmerger=NULL;
    if(ngrams)
        g_ngmerger=new Merger<ngram_t>(ngrams);
    
    g_merger=new Merger<cooc_type>(res_vec);//itr=res_vec->get_merge_iterator(cmp_by_val);
    
	//int more=0;
    /*
    tmp_vec.clear(); 
    unsigned long 	tocc,
                  invn[3];
                      
                  cooc_type::key_type new_trgt;
                  cout<<"Merging the sorted parts\n";
     */
                  // Counters summed across threads by the OpenMP reduction below.
                  unsigned long   big_n1=0,
                  big_n2=0,big_n3=0,big_n4=0,
                  n_matches=0;
    // Second-pass merge: same init/barrier/loop structure as the first pass.
    // Each thread passes the addresses of its own reduction copies of
    // big_n1..big_n4 to score_one_source() and adds its return value to n_matches.
    #pragma omp parallel shared(g_merger,g_ngmerger,cooc_vec,res_vec,ngrams,pairs2keep),\
    reduction(+:big_n1),reduction(+:big_n2),reduction(+:big_n3),reduction(+:big_n4), reduction(+:n_matches)
    {
         g_merger->init(omp_get_thread_num());
         if(g_ngmerger)
             g_ngmerger->init(omp_get_thread_num());
        #pragma omp barrier

         while(true){
            n_matches+=score_one_source(cooc_vec,&big_n1,&big_n2,&big_n3,&big_n4,g_merger,g_ngmerger,pairs2keep);//score_one_source(&lock,&lq,res_vec,g_merger);
            if(!(*g_merger))
                break;
//            cout<<omp_get_thread_num()<<": "<</*g_merger.count()<<*/"\t";
        }
    }
                
    timer.stop();
    cout<<"Done merging in: "<<timer.formatted_span()<<endl;

    cout<<"Disposing unneeded resources\n";
    // NOTE(review): the timer is restarted without reset(); presumably the
    // disposal time is meant to accumulate on top of the merge time - confirm
    // against Crono.
    timer.start();
    delete g_merger;
    delete res_vec;
    if(ngrams){
        cout<<"\nNumber of matching pairs to provided ngrams: "<<n_matches<<" ("<<100*double(n_matches)/cooc_vec->size()<<"%)"<<endl;
        delete g_ngmerger;
        delete ngrams;
    }

    gs.big_n1=big_n1;
    gs.big_n2=big_n2;
    gs.big_n3=big_n3;
    gs.big_n4=big_n4;
	//cout<<"Phrases more than 4 times: "<<more<<endl;
    // Size of the second-pass output vector.
    gs.all=cooc_vec->size();
    if(g_scores){
        // Zero counts for N1..N3 are replaced by 1 (with a warning) before the
        // scores are handed back; big_n4 is copied as-is.
        if(gs.big_n1==0){
            cerr<<"Warning: N1 was found to be zero, taking N1=1!\n";
            gs.big_n1=1;
        }
        if(gs.big_n2==0){
            cerr<<"Warning: N2 was found to be zero, taking N2=1!\n";
            gs.big_n2=1;
        }
        if(gs.big_n3==0){
            cerr<<"Warning: N3 was found to be zero, taking N3=1!\n";
            gs.big_n3=1;
        }
        memcpy(g_scores,&gs,sizeof(global_score_type));
    }

    timer.stop();
    cout<<"\nSecond run of scoring done\nSecond run, including partial sorting, took "<<
            timer.formatted_span(phase2span)<<endl;
    
    cout<<"\nTwo phase phrase scoring took "<<timer.formatted_span(phase1span+phase2span)<<endl<<endl;
	//calc corresponding scores    
}
Beispiel #2
0
template<class T, class H> void
CDR_Test<T, H>::do_test (int total, int niter, int use_array,
                         char* srcbuf, char* dstbuf,
                         int src_offset, int dst_offset)
{
  if (!use_array)
    {
      dst_offset = src_offset = 0;
    }

  ACE_DEBUG((LM_DEBUG,
             ACE_TEXT( "Starting Test for %s: %d elements " )
             ACE_TEXT( "%susing arrays.\n" ),
             H::name (),
             total,
             ((use_array) ? ACE_TEXT( "" ) : ACE_TEXT( "not " ))));


  if (!use_array && (total % 4) != 0)
    {
      int lasttotal = total;
      total -= (total % 4);
      ACE_DEBUG((LM_DEBUG,
                 ACE_TEXT( "Rounding from %d to %d elements.\n" ),
                 lasttotal,
                 total));
    }

  char* src = ACE_ptr_align_binary(srcbuf, H::size ());
  T* idata = reinterpret_cast<T*> (src);
  idata += src_offset;
  src = reinterpret_cast<char*> (idata);

  {
    int i;
    for (i = 0; i < total; i++)
      {
        idata[i] = CDR_Test<T, H>::checkval (i);
      }
  }

  ACE_DEBUG((LM_DEBUG,
             ACE_TEXT( "Writing data...\n" )));

  char* toread = 0;
  {
    ACE_TEST_ASSERT(use_array || total % 4 == 0);

    double totalsecs = 0.0;
    int n;
    for (n = 0; n < niter; n++)
      {
        size_t size = H::size () * (dst_offset + total) +
                      ACE_CDR::MAX_ALIGNMENT;
        ACE_OutputCDR os (dstbuf, size);

        // This is intrusive...
        char* const end = os.begin ()->wr_ptr() + size;

        do_seal (end);

        double secs = 0.0;
        if (use_array)
          {
            {
              int i;
              for (i = 0; i < dst_offset; i++)
                {
                  os << T(0);
                }
            }

            if (n == 0)
              {
                ACE_DEBUG((LM_DEBUG,
                           ACE_TEXT ("* src align = %d, dst align = %d\n"),
                           tellalign (src),
                           tellalign (os.begin ()->wr_ptr ())));
              }

            Crono crono;
            crono.start ();
            H::write_array (os, idata, total);
            crono.stop ();
            secs = crono.read_seconds ();
          }
        else
          {
            int i = 0;
            for (; i < dst_offset; i++)
              {
                os << T(0);
              }
            i = 0;

            Crono crono;
            crono.start();
            while (i < total)
              {
                os << idata[i++];
                os << idata[i++];
                os << idata[i++];
                os << idata[i++];
                // static char rs[32 + 1];
                // CDR_Test<T,H>::ttoh (idata[i], rs);
                // ACE_DEBUG ((LM_DEBUG, "Write idata[%d] = %s\n", i, rs));
                // os << idata[i];
                // i++;
              }
            crono.stop ();
            secs = crono.read_seconds ();
          }

        if (!check_seal(end))
          {
            ACE_ERROR((LM_ERROR,
                       ACE_TEXT( "Broken seal, aborting.\n" )));
            ACE_OS::exit(1);
          }

        totalsecs += secs;

        if (n == niter - 1)
          {
            toread = os.begin ()->rd_ptr ();
          }
      }

    totalsecs = totalsecs / niter;

    ACE_DEBUG((LM_DEBUG,
               ACE_TEXT ("Writing to stream %d %s values: %f seconds.\n"),
               total,
               H::name (),
               totalsecs));
  }

  {
    int i;
    for (i = 0; i < total; i++)
      {
        idata[i] = 0;
      }
  }

  ACE_DEBUG((LM_DEBUG,
             ACE_TEXT( "Reading them back in opposing byte order...\n" )));

  const int opposite_byte_order = 1 - ACE_CDR_BYTE_ORDER;

  {
    double totalsecs = 0.0;
    int n;
    for (n = 0; n < niter; n++)
      {
        ACE_DEBUG ((LM_DEBUG, ACE_TEXT ("====== Read iteration %d\n"), n));

        size_t size = (total + dst_offset) * H::size ();
        ACE_InputCDR is (toread, size, opposite_byte_order);

        // This is intrusive...
        char* const end = is.rd_ptr () + size;

        do_seal (end);

        double secs = 0.0;
        if (use_array)
          {
            {
              int i;
              for (i = 0; i < dst_offset; i++)
                {
                  T v;
                  is >> v;
                }
            }

            if (n == 0)
              {
                ACE_DEBUG((LM_DEBUG,
                           ACE_TEXT ("* src align = %d, dst align = %d\n"),
                           tellalign (is.rd_ptr ()),
                           tellalign (src)));
              }

            Crono crono;
            crono.start ();
            H::read_array (is, idata, total);
            crono.stop ();
            secs = crono.read_seconds ();

            // Testing for good bit value. Try reading atleast 10
            // times the size of total. It should fail with good bit
            // set to 0.
            H::read_array (is, idata, 10 * total);

            if (is.good_bit () != 0)
              {
                ACE_ERROR ((LM_ERROR,
                            ACE_TEXT ("Test for good bit failed in %s Array_test\n"),
                            H::name ()));
              }
          }
        else
          {
            int i = 0;
            Crono crono;
            crono.start ();
            while (i < total)
              {
#if 0
                T v;
                is >> v;
                static char rs[32 + 1];
                CDR_Test<T,H>::ttoh (v, rs);
                ACE_DEBUG ((LM_DEBUG, "Read idata[%d] = %s\n", i, rs));
                idata[i] = v;
                i++;
#else
                is >> idata[i++];
                is >> idata[i++];
                is >> idata[i++];
                is >> idata[i++];
#endif /* 0 */
              }
            crono.stop ();
            secs = crono.read_seconds ();
          }
        totalsecs += secs;

        if (!check_seal (end))
          {
            ACE_ERROR((LM_ERROR,
                       ACE_TEXT( "Broken seal, aborting.\n" )));
            ACE_OS::exit(1);
          }
      }

    totalsecs = totalsecs / niter;

    ACE_DEBUG((LM_DEBUG,
               ACE_TEXT ("Reading from stream %d %s values")
               ACE_TEXT (" (byte swapping): %f seconds.\n"),
               total,
               H::name (),
               totalsecs));
  }