double MultiProbeLshModel::recall (double x) const { double x2 = W_ / x; double p = col_helper(x2); unsigned MT = __probeSequenceTemplates[M_].size(); if (MT > T_) MT = T_; double result = 0; for (unsigned i = 0; i < MT; i++) { double r = 1.0; for (unsigned j = 0; j < M_; j++) { Probe &probe = __probeSequenceTemplates[M_][i]; if (probe.mask & leftshift(j)) { double delta = (j + 1.0) / (M_ + 1.0) * 0.5; // expected value if (probe.shift & leftshift(j)) { r *= p_col_helper(x2, 1.0 - delta); } else { r *= p_col_helper(x2, delta); } } else r *= p; } result += r; } return 1.0 - std::exp(std::log(1.0 - result) * L_); }
/**
 * Fills seq with at most T bucket indices to probe for obj.
 *
 * Steps, as implemented:
 *   1. Every component hash i is evaluated on obj, yielding a base value and
 *      a float delta. Two candidate perturbations are recorded per component:
 *      one with reserve = 1 and score delta, one with reserve = unsigned(-1)
 *      and score 1 - delta. Both remember the component index in `mask`.
 *   2. The candidates are sorted with std::sort (using the element type's
 *      ordering).
 *   3. The stored probe-sequence template for lsh_.size() components is
 *      walked in order. For each template entry a combined hash is built from
 *      the first lsh_.size() sorted candidates: the candidate's base value,
 *      adjusted when the template's mask bit for that position is set, is
 *      multiplied by a_[component] and summed. The sum modulo H_ is appended
 *      to seq.
 *
 * All hash arithmetic is done in `unsigned`, so it wraps modulo 2^N.
 *
 * @param obj  object to hash.
 * @param seq  output; cleared first, then filled with bucket indices.
 * @param T    maximum number of entries to produce.
 */
void MultiProbeLsh::genProbeSequence (Domain obj, std::vector<unsigned> &seq, unsigned T) const
{
    ProbeSequence scores;
    std::vector<unsigned> base;

    scores.resize(2 * lsh_.size());
    base.resize(lsh_.size());

    for (unsigned i = 0; i < lsh_.size(); ++i) {
        float delta;
        base[i] = Super::lsh_[i](obj, &delta);

        // Candidate with reserve = 1 (direction).
        scores[2*i].mask = i;
        scores[2*i].score = delta;
        scores[2*i].reserve = 1;

        // Candidate with reserve = unsigned(-1), the opposite direction.
        scores[2*i+1].mask = i;
        scores[2*i+1].score = 1.0 - delta;
        scores[2*i+1].reserve = unsigned(-1);
    }

    std::sort(scores.begin(), scores.end());

    const ProbeSequence &tmpl = (*ProbeSequenceTemplates::get())[lsh_.size()];

    seq.clear();
    for (ProbeSequence::const_iterator it = tmpl.begin();
         it != tmpl.end() && seq.size() != T;
         ++it)
    {
        const Probe &probe = *it;
        unsigned hash = 0;
        for (unsigned i = 0; i < lsh_.size(); ++i) {
            unsigned h = base[scores[i].mask];
            if (probe.mask & leftshift(i)) {
                // Template's shift bit selects which multiple of the
                // candidate's reserve is added.
                h += (probe.shift & leftshift(i))
                         ? scores[i].reserve
                         : unsigned(-1) * scores[i].reserve;
            }
            hash += h * a_[scores[i].mask];
        }
        seq.push_back(hash % H_);
    }
}
/*从字符串中抽取所有的超链接,移除左侧包含所有超链接的最短子串,返回剩余子串的长度*/ int extractLink(char *buf, char *domain) { const char *regex = "href=\"[^ >]*\""; regex_t preg; regmatch_t pm[MAXMATCH]; int nmatch = MAXMATCH; char tmp[MAX_LINK_LEN]; if (regcomp(&preg, regex, REG_EXTENDED|REG_ICASE) != 0) { /*编译正则表达式失败 */ debug_printf("%s %d init regex failed \n",__func__,__LINE__); return leftshift(buf); } int z, i; z = regexec(&preg, buf, nmatch, pm, 0); regfree(&preg); if (z == REG_NOMATCH) { /*无匹配项 */ return leftshift(buf); } else { /*有匹配的超链接 */ for (i = 0; i < nmatch && pm[i].rm_so != -1; ++i) { /*把超链接都提取出来 */ int bpos = pm[i].rm_so + 6; int epos = pm[i].rm_eo - 2; int len = epos - bpos + 1; strncpy(tmp, buf + bpos, len); tmp[len] = '\0'; debug_printf("%s %d original link:[%p]\n",__func__,__LINE__,tmp); Url *temp = calloc(1, sizeof(Url)); if(!temp) continue; if(patchlink(tmp, domain,temp)) { free(temp); continue; } debug_printf("%s %d whole link:[%p]\n",__func__,__LINE__,temp->str); enqueue(temp); } return leftshift(buf + pm[nmatch - 1].rm_eo); } }
/**
 * Initialises seq with 2*M entries: one pair per bit position l in [0, M).
 *
 * Entry l (front half) and its mirror entry 2*M - l - 1 (back half) both get
 * mask = leftshift(l). The front entry has shift = 0, the mirror entry has
 * shift = leftshift(l). All reserve fields are set to 0.
 *
 * Scores:
 *   front  = (l+1)(l+2) / ((M+1)(M+2)) * 0.25
 *   mirror = 1 - 2*delta + front,  with delta = (l+1)/(M+1) * 0.5
 *
 * @param seq  sequence to (re)build; resized to 2*M.
 * @param M    number of bit positions; must not exceed the bit width of the
 *             mask field (asserted).
 */
void GenExpectScores (ProbeSequence &seq, unsigned M)
{
    assert(M <= sizeof(seq[0].mask)* 8);
    seq.resize(2 * M);

    // `mirror` walks down from the last slot while `l` walks up from the
    // first, so mirror == 2*M - l - 1 throughout.
    for (unsigned l = 0, mirror = 2 * M - 1; l < M; ++l, --mirror) {
        seq[mirror].shift = leftshift(l);
        seq[mirror].mask = seq[mirror].shift;
        seq[l].mask = seq[mirror].mask;
        seq[l].shift = 0;

        seq[mirror].reserve = 0;
        seq[l].reserve = seq[mirror].reserve;

        const double rank = l + 1.0;
        float delta = rank / (M + 1.0) * 0.5;
        seq[l].score = rank * (l + 2.0) / (M + 1.0) / (M + 2.0) * 0.25;
        seq[mirror].score = 1.0 - 2.0 * delta + seq[l].score;
    }
}
/*
 * Invokes one of the five shifting primitives, chosen by its position in the
 * fixed ring: 0 cycling, 1 upshift, 2 rightshift, 3 downshift, 4 leftshift.
 */
static void randomization_step(int op)
{
    switch (op) {
    case 0: cycling();    break;
    case 1: upshift();    break;
    case 2: rightshift(); break;
    case 3: downshift();  break;
    case 4: leftshift();  break;
    }
}

/*
 * Runs `times` rounds of shifting operations.
 *
 * Round a (1-based) performs nine calls: the five ring operations in order,
 * starting at ring position (5 - a % 5) % 5, followed by the first four of
 * those again in reverse, so each round is a palindrome. Spelled out per
 * a % 5:
 *   0: cycling up right down left  down right up cycling
 *   1: left cycling up right down  right up cycling left
 *   2: down left cycling up right  up cycling left down
 *   3: right down left cycling up  cycling left down right
 *   4: up right down left cycling  left down right up
 */
void randomization()
{
    int a;

    for (a = 1; a <= times; a++) {
        int start = (5 - a % 5) % 5;
        int t;

        /* Forward pass over the whole ring ... */
        for (t = 0; t < 5; t++)
            randomization_step((start + t) % 5);

        /* ... then back again, without repeating the turning point. */
        for (t = 3; t >= 0; t--)
            randomization_step((start + t) % 5);
    }
}