Exemplo n.º 1
0
/*----------------------------------------------------------------------------------
  sparse_add_new_cut( new_col_H, new_cut, cut_length, nSel ) does the following:

    new_a = sum(data_X(:,find(new_cut ~=0 )),2);
    new_col_H = [sparse_A(:,1:nSel)'*new_a ; new_a'*new_a];
    sparse_A(:,nSel+1) = new_a;

  ---------------------------------------------------------------------------------*/
void* CWDSVMOcas::add_new_cut_helper( void* ptr)
{
	wdocas_thread_params_add* p = (wdocas_thread_params_add*) ptr;
	CWDSVMOcas* o = p->wdocas;
	int32_t start = p->start;
	int32_t end = p->end;
	int32_t string_length = o->string_length;
	//uint32_t nDim=(uint32_t) o->w_dim;
	uint32_t cut_length=p->cut_length;
	uint32_t* new_cut=p->new_cut;
	int32_t* w_offsets = o->w_offsets;
	float64_t* y = o->lab;
	int32_t alphabet_size = o->alphabet_size;
	float32_t* wd_weights = o->wd_weights;
	int32_t degree = o->degree;
	CStringFeatures<uint8_t>* f = o->features;
	float64_t normalization_const = o->normalization_const;

	// temporary vector
	float32_t* new_a = p->new_a;
	//float32_t* new_a = SG_MALLOC(float32_t, nDim);
	//memset(new_a, 0, sizeof(float32_t)*nDim);

	int32_t* val=SG_MALLOC(int32_t, cut_length);
	for (int32_t j=start; j<end; j++)
	{
		int32_t offs=o->w_dim_single_char*j;
		memset(val,0,sizeof(int32_t)*cut_length);
		int32_t lim=CMath::min(degree, string_length-j);
		int32_t len;

		for (int32_t k=0; k<lim; k++)
		{
			bool free_vec;
			uint8_t* vec = f->get_feature_vector(j+k, len, free_vec);
			float32_t wd = wd_weights[k]/normalization_const;

			for(uint32_t i=0; i < cut_length; i++)
			{
				val[i]=val[i]*alphabet_size + vec[new_cut[i]];
				new_a[offs+val[i]]+=wd * y[new_cut[i]];
			}
			offs+=w_offsets[k];
			f->free_feature_vector(vec, j+k, free_vec);
		}
	}

	//p->new_a=new_a;
	SG_FREE(val);
	return NULL;
}
/**
 * Compute the kernel value of string idx_a with itself.
 *
 * The occurrences of every symbol in the string are counted into the
 * dict_diagonal_optimization scratch table.  With use_sign set the result
 * is the number of distinct symbols that occur; otherwise it is the sum of
 * the squared occurrence counts.
 *
 * Requires lhs and rhs to be the same feature object and the number of
 * symbols not to exceed dictionary_size (both checked via ASSERT).
 *
 * @param idx_a index of the string in lhs
 * @return the diagonal kernel value
 */
float64_t CCommWordStringKernel::compute_diag(int32_t idx_a)
{
	int32_t alen;
	CStringFeatures<uint16_t>* l = (CStringFeatures<uint16_t>*) lhs;
	CStringFeatures<uint16_t>* r = (CStringFeatures<uint16_t>*) rhs;

	bool free_av;
	uint16_t* av=l->get_feature_vector(idx_a, alen, free_av);

	float64_t result=0.0 ;
	ASSERT(l==r)
	ASSERT(sizeof(uint16_t)<=sizeof(float64_t))
	ASSERT((1<<(sizeof(uint16_t)*8)) > alen)

	int32_t num_symbols=(int32_t) l->get_num_symbols();
	ASSERT(num_symbols<=dictionary_size)

	// reset only the part of the scratch table that is used below
	int32_t* dic = dict_diagonal_optimization;
	memset(dic, 0, num_symbols*sizeof(int32_t));

	// histogram of the symbols in this string
	for (int32_t i=0; i<alen; i++)
		dic[av[i]]++;

	if (use_sign)
	{
		// every symbol that occurs counts exactly once
		for (int32_t i=0; i<num_symbols; i++)
		{
			if (dic[i]!=0)
				result++;
		}
	}
	else
	{
		for (int32_t i=0; i<num_symbols; i++)
		{
			if (dic[i]!=0)
			{
				// Multiply in floating point: a count may be as large as
				// 65535 (see the ASSERT on alen above) and its square does
				// not fit into int32_t.
				const float64_t count=(float64_t) dic[i];
				result+=count*count;
			}
		}
	}
	l->free_feature_vector(av, idx_a, free_av);

	return result;
}
/**
 * Compute the kernel value between string idx_a of lhs and string idx_b
 * of rhs by walking both symbol sequences in lock-step.
 *
 * With do_sort set, sorted private copies of both strings are made first.
 * Otherwise the strings are used as they are, and SG_ERROR is raised when
 * the number of applied preprocessors differs from the number of attached
 * ones on either side.
 *
 * With use_sign set every symbol present in both strings contributes 1;
 * otherwise it contributes the product of its occurrence counts.
 *
 * @param idx_a index of the string in lhs
 * @param idx_b index of the string in rhs
 * @param do_sort whether to sort copies of the strings before comparing
 * @return the kernel value
 */
float64_t CCommWordStringKernel::compute_helper(
	int32_t idx_a, int32_t idx_b, bool do_sort)
{
	int32_t alen, blen;
	bool free_av, free_bv;

	CStringFeatures<uint16_t>* l = (CStringFeatures<uint16_t>*) lhs;
	CStringFeatures<uint16_t>* r = (CStringFeatures<uint16_t>*) rhs;

	uint16_t* av=l->get_feature_vector(idx_a, alen, free_av);
	uint16_t* bv=r->get_feature_vector(idx_b, blen, free_bv);

	// sequences that are actually compared; replaced by copies when sorting
	uint16_t* avec=av;
	uint16_t* bvec=bv;

	if (!do_sort)
	{
		if ( (l->get_num_preprocessors() != l->get_num_preprocessed()) ||
				(r->get_num_preprocessors() != r->get_num_preprocessed()))
		{
			SG_ERROR("not all preprocessors have been applied to training (%d/%d)"
					" or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preprocessors(),
					r->get_num_preprocessed(), r->get_num_preprocessors());
		}
	}
	else
	{
		// empty strings get no copy at all
		avec=NULL;
		if (alen>0)
		{
			avec=SG_MALLOC(uint16_t, alen);
			memcpy(avec, av, sizeof(uint16_t)*alen);
			CMath::radix_sort(avec, alen);
		}

		bvec=NULL;
		if (blen>0)
		{
			bvec=SG_MALLOC(uint16_t, blen);
			memcpy(bvec, bv, sizeof(uint16_t)*blen);
			CMath::radix_sort(bvec, blen);
		}
	}

	const bool count_once=use_sign;
	float64_t result=0;

	int32_t ai=0;
	int32_t bi=0;

	while (ai<alen && bi<blen)
	{
		const uint16_t asym=avec[ai];
		const uint16_t bsym=bvec[bi];

		// advance whichever side holds the smaller symbol
		if (asym<bsym)
		{
			ai++;
			continue;
		}
		if (bsym<asym)
		{
			bi++;
			continue;
		}

		// same symbol on both sides: skip over the whole run in each string
		const int32_t a_run_start=ai;
		const int32_t b_run_start=bi;

		while (ai<alen && avec[ai]==asym)
			ai++;

		while (bi<blen && bvec[bi]==asym)
			bi++;

		if (count_once)
			result++;
		else
			result+=((float64_t) (ai-a_run_start))*
				((float64_t) (bi-b_run_start));
	}

	if (do_sort)
	{
		SG_FREE(avec);
		SG_FREE(bvec);
	}

	l->free_feature_vector(av, idx_a, free_av);
	r->free_feature_vector(bv, idx_b, free_bv);

	return result;
}
/**
 * Compute the weighted kernel value between string idx_a of lhs and
 * string idx_b of rhs.
 *
 * With do_sort set, sorted private copies of both strings are made first.
 * Otherwise the strings are used as they are, and SG_ERROR is raised when
 * the number of applied preprocessors differs from the number of attached
 * ones on either side.
 *
 * For each d in 0..degree-1 one more bit is added to a mask, the mask is
 * turned into a symbol mask via get_masked_symbols(), and both sequences
 * are walked in lock-step comparing the masked symbols.  Each masked
 * symbol present in both strings contributes
 * weights[d]^2 * (occurrences in a) * (occurrences in b).
 *
 * @param idx_a index of the string in lhs
 * @param idx_b index of the string in rhs
 * @param do_sort whether to sort copies of the strings before comparing
 * @return the kernel value
 */
float64_t CWeightedCommWordStringKernel::compute_helper(
	int32_t idx_a, int32_t idx_b, bool do_sort)
{
	int32_t alen, blen;
	bool free_avec, free_bvec;

	CStringFeatures<uint16_t>* l = (CStringFeatures<uint16_t>*) lhs;
	CStringFeatures<uint16_t>* r = (CStringFeatures<uint16_t>*) rhs;

	uint16_t* av=l->get_feature_vector(idx_a, alen, free_avec);
	uint16_t* bv=r->get_feature_vector(idx_b, blen, free_bvec);

	// sequences that are actually compared; replaced by copies when sorting
	uint16_t* avec=av;
	uint16_t* bvec=bv;

	if (!do_sort)
	{
		if ( (l->get_num_preprocessors() != l->get_num_preprocessed()) ||
				(r->get_num_preprocessors() != r->get_num_preprocessed()))
		{
			SG_ERROR("not all preprocessors have been applied to training (%d/%d)"
					" or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preprocessors(),
					r->get_num_preprocessed(), r->get_num_preprocessors());
		}
	}
	else
	{
		// empty strings get no copy at all
		avec=NULL;
		if (alen>0)
		{
			avec=new uint16_t[alen];
			memcpy(avec, av, sizeof(uint16_t)*alen);
			CMath::radix_sort(avec, alen);
		}

		bvec=NULL;
		if (blen>0)
		{
			bvec=new uint16_t[blen];
			memcpy(bvec, bv, sizeof(uint16_t)*blen);
			CMath::radix_sort(bvec, blen);
		}
	}

	float64_t result=0;
	uint8_t mask=0;

	for (int32_t d=0; d<degree; d++)
	{
		// add one more bit to the mask on every pass
		mask |= (1 << (degree-d-1));
		const uint16_t sym_mask=l->get_masked_symbols(0xffff, mask);
		const float64_t weight=weights[d]*weights[d];

		int32_t ai=0;
		int32_t bi=0;

		while (ai<alen && bi<blen)
		{
			const uint16_t lsym=avec[ai] & sym_mask;
			const uint16_t rsym=bvec[bi] & sym_mask;

			// advance whichever side holds the smaller masked symbol
			if (lsym<rsym)
			{
				ai++;
				continue;
			}
			if (rsym<lsym)
			{
				bi++;
				continue;
			}

			// same masked symbol on both sides: skip the whole run in each
			const int32_t a_run_start=ai;
			const int32_t b_run_start=bi;

			while (ai<alen && (avec[ai] & sym_mask)==lsym)
				ai++;

			while (bi<blen && (bvec[bi] & sym_mask)==lsym)
				bi++;

			const int32_t a_run=ai-a_run_start;
			const int32_t b_run=bi-b_run_start;
			result+=weight*a_run*b_run;
		}
	}

	if (do_sort)
	{
		delete[] avec;
		delete[] bvec;
	}

	l->free_feature_vector(av, idx_a, free_avec);
	r->free_feature_vector(bv, idx_b, free_bvec);

	return result;
}