void linkwordadd(struct pagewordsFormat *pagewords, char word[]) { int wordlLength; wordlLength = strlen(word); convert_to_lowercase((unsigned char *)word); //-1 så vi ikke appender alt på slutten av siste linken hvis vi har mr en IndexerMaxLinks if ((IndexerMaxLinks)> (*pagewords).nrOfOutLinks) { if (!(*pagewords).outlinks[(*pagewords).nrOfOutLinks -1].good) { //dårlig link } else if (((*pagewords).outlinks[(*pagewords).nrOfOutLinks -1].linktextlen + wordlLength +2 ) > sizeof((*pagewords).outlinks[(*pagewords).nrOfOutLinks -1].linktext)) { //har ikke mer plass til link tekst } else { //må ha -1 her da det er den nåverende vi jobber på, ikke den neste strcpy((*pagewords).outlinks[(*pagewords).nrOfOutLinks -1].linktext + (*pagewords).outlinks[(*pagewords).nrOfOutLinks -1].linktextlen,word); (*pagewords).outlinks[(*pagewords).nrOfOutLinks -1].linktextlen += wordlLength; strcpy((*pagewords).outlinks[(*pagewords).nrOfOutLinks -1].linktext + (*pagewords).outlinks[(*pagewords).nrOfOutLinks -1].linktextlen," "); (*pagewords).outlinks[(*pagewords).nrOfOutLinks -1].linktextlen += 1; } } else { #ifdef DEBUG printf("note: to many links. Has %i sow far\n",(*pagewords).nrOfOutLinks); #endif } }
static int make_key(char *buf, size_t buflen, enum keytype type, struct mlfi_priv *priv, const char *rcpt) { int r = 0; switch (type) { case TO: r = snprintf(buf, buflen, "%s", rcpt); break; case TO_IP: r = snprintf(buf, buflen, "%s:%s", rcpt, priv->priv_ip); break; case TO_IP_FROM: r = snprintf(buf, buflen, "%s:%s:%s", rcpt, priv->priv_ip, priv->priv_from); break; case BOUNCE_TO: r = snprintf(buf, buflen, "%s:<>", rcpt); break; case BOUNCE_TO_IP: r = snprintf(buf, buflen, "%s:%s:<>", rcpt, priv->priv_ip); break; } if (r >= buflen) { return 0; } convert_to_lowercase (buf, r); return r; }
void attribadd(struct IndexerRes_attrib *attrib, char word[]) { convert_to_lowercase((unsigned char *)word); #ifdef DEBUG printf("attribadd: got \"%s\"\n",word); #endif if ((*attrib).attribnr > maxAttribForPage){ #ifdef DEBUG printf("more than maxAttribForPage words\n"); #endif } else { (*attrib).attrib[(*attrib).attribnr].WordID = crc32boitho(word); (*attrib).attrib[(*attrib).attribnr].position = 0; ++(*attrib).attribnr; } }
void acladd(struct IndexerRes_acls *acl, char word[]) { convert_to_lowercase((unsigned char *)word); #ifdef DEBUG printf("acladd: got \"%s\"\n",word); #endif if ((*acl).aclnr > maxAclForPage){ #ifdef DEBUG printf("mor then maxAclForPage words\n"); #endif } else { (*acl).acls[(*acl).aclnr].WordID = crc32boitho(word); (*acl).acls[(*acl).aclnr].position = 0; ++(*acl).aclnr; } }
void suggest_1(char *host, char *arg, char *user, char *collection) { CLIENT *clnt; numbest_res *result_1; struct senddata args; char first[1024]; char suggeston[1024]; char **wordlist; int splitn; splitn = split(arg, " ", &wordlist); if (splitn == 0) return; args.word = wordlist[splitn-1]; convert_to_lowercase(args.word); args.user = user; args.collection = (collection ? collection : ""); #ifndef DEBUG clnt = clnt_create (host, SUGGEST, SUGGESTVERS, "udp"); if (clnt == NULL) { clnt_pcreateerror (host); exit (1); } #endif /* DEBUG */ /* XXX: set lower timeout */ result_1 = get_best_results_2(&args, clnt); if (!result_1) { #if 1 //clnt_perror (clnt, "call failed"); #endif } else { if (result_1->_errno == 0) { namelist nl; int i; if (strlen(args.word) > 0) { for (nl = result_1->numbest_res_u.list; nl != NULL; nl = nl->next) { for (i = 0; i < splitn-1; i++) printf("%s ", wordlist[i]); printf("%s\n", nl->name); } } else { for (i = 0; i < splitn-1; i++) printf("%s ", wordlist[i]); puts(""); } } } FreeSplitList(wordlist); #ifndef DEBUG clnt_destroy (clnt); #endif /* DEBUG */ }
void fn( char* word, int pos, enum parsed_unit pu, enum parsed_unit_flag puf, void* pagewords ) { #ifdef DEBUG printf("\t%s (%i) ", word, pos); printf("type %i ",pu); #endif switch (pu) { case pu_word: switch (puf) { case puf_none: printf(" none"); break; case puf_title: printf(" +title"); break; case puf_h1: printf(" +h1"); break; case puf_h2: printf(" +h2"); break; case puf_h3: printf(" +h3"); break; case puf_h4: printf(" +h4"); break; case puf_h5: printf(" +h5"); break; case puf_h6: printf(" +h6"); break; } convert_to_lowercase(word); printf("[word] is now %s (crc32 %u, pos %i)", word, crc32boitho(word), pos); break; case pu_linkword: printf("[linkword]"); break; case pu_link: printf("[link]"); break; case pu_baselink: printf("[baselink]"); break; case pu_meta_keywords: printf("[meta keywords]"); break; case pu_meta_description: printf("[meta description]"); break; case pu_meta_author: printf("[meta author]"); break; default: printf("[...]"); } printf("\n"); }
/*
 * Usage: ./anchorread lotnr subname
 *
 * Reads every anchor text in the given lot and writes one reverse-index
 * record per indexable word to the lot's "Anchor" revindex files.
 *
 * Each anchor text is lowercased and split on spaces. Per DocID a running
 * counter (starting at 2000 so these hits are easy to tell apart visually from
 * other hits) is used as the hit value:
 *   - empty tokens and stop words are skipped without advancing the counter,
 *   - "www" advances the counter but is not written,
 *   - every other word is written and then advances the counter.
 * The counter is also advanced once per anchor text before its words are
 * processed. Hit values are capped at 65535.
 *
 * Record layout, written to the bucket file WordID % NrOfDataDirectorys:
 *   DocID (unsigned int), lang (unsigned char, always 0),
 *   WordID (unsigned long), nr (unsigned long, always 1),
 *   hits (unsigned short).
 */
int main (int argc, char *argv[]) {

	int lotNr;
	int i;
	unsigned int DocID;
	char text[50];
	unsigned int radress;
	unsigned int rsize;
	char **Data;
	int TokCount;
	unsigned short hits;
	unsigned long WordID;
	/* Must be unsigned long: sizeof(unsigned long) bytes of it are written below. */
	unsigned long nr;
	int bucket;
	FILE *revindexFilesHa[NrOfDataDirectorys];
	unsigned char lang;
	FILE *FH;
	unsigned int DocIDPlace;
	char *subname;
	int *nrOfLinkWordsToDocID;

	/* check that we were told which lot to use */
	if (argc < 3) {
		printf("Usage: ./anchorread lotnr subname\n\n");
		exit(1);
	}

	lotNr = atoi(argv[1]);
	subname = argv[2];

	nrOfLinkWordsToDocID = malloc(sizeof(int) * NrofDocIDsInLot);
	if (nrOfLinkWordsToDocID == NULL) {
		perror("malloc nrOfLinkWordsToDocID");
		exit(1);
	}

	for (i=0;i<NrofDocIDsInLot;i++) {
		/* start at 2000 so these are easy to tell apart visually from other hits */
		nrOfLinkWordsToDocID[i] = 2000;
	}

	/* Only probe that the lot has an anchors file; reading goes through anchorGetNext. */
	if ( (FH = lotOpenFileNoCasheByLotNr(lotNr,"anchors","rb", 's',subname)) == NULL) {
		printf("lot dont have a anchors file\n");
		exit(1);
	}
	fclose(FH);

	revindexFilesOpenLocal(revindexFilesHa,lotNr,"Anchor","wb",subname);

	while (anchorGetNext(lotNr,&DocID,text,sizeof(text),&radress,&rsize,subname) ) {

		DocIDPlace = (DocID - LotDocIDOfset(rLotForDOCid(DocID)));
		++nrOfLinkWordsToDocID[DocIDPlace];

		convert_to_lowercase((unsigned char *)text);

		#ifdef DEBUG
			if (DocID == 4999999) {
				printf("DocID %i, text: \"%s\", DocIDPlace %i, nrOfLinkWordsToDocID %i\n",DocID,text,DocIDPlace,nrOfLinkWordsToDocID[DocIDPlace]);
			}
		#endif

		if ((TokCount = split(text, " ", &Data)) == -1) {
			printf("canæt splitt \"%s\"\n",text);
			/* Data is not usable after a failed split; move on to the next anchor. */
			continue;
		}

		for (i = 0; Data[i] != NULL; i++) {

			if (Data[i][0] == '\0') {
				#ifdef DEBUG
					if (DocID == 4999999) {
						printf("emty data element\n");
					}
				#endif
			}
			else if (strcmp(Data[i],"www") == 0) {
				#ifdef DEBUG
					if (DocID == 4999999) {
						printf("www\n");
					}
				#endif
				++nrOfLinkWordsToDocID[DocIDPlace];
			}
			else if (isStoppWord(Data[i])) {
				#ifdef DEBUG
					if (DocID == 4999999) {
						printf("stopword \"%s\"\n",Data[i]);
					}
				#endif
			}
			else {
				#ifdef DEBUG
					if (DocID == 4999999) {
						printf("\t\"%s\" %i\n",Data[i],nrOfLinkWordsToDocID[DocIDPlace]);
					}
				#endif

				WordID = crc32boitho(Data[i]);

				if (WordID == 0) {
					printf("got 0 as word id for \"%s\". Somthing may be wrong.\n",Data[i]);
				}

				bucket = WordID % NrOfDataDirectorys;

				/* hits is 16 bits wide, so clamp the counter. */
				if (nrOfLinkWordsToDocID[DocIDPlace] > 65535) {
					hits = 65535;
				}
				else {
					hits = nrOfLinkWordsToDocID[DocIDPlace];
				}

				#ifdef DEBUG
					if (DocID == 4999999) {
						printf("\thits %i: \"%s\": %hu, bucket %i\n",i,Data[i],hits,bucket);
					}
				#endif

				if (fwrite(&DocID,sizeof(unsigned int),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite DocID");
				}

				/*
				 * runarb, 13 May 2007: language is now stored as a number.
				 * It should be taken from DocumentIndex when available, but it
				 * is not stored there, so 0 is written for now.
				 */
				lang = 0;
				nr = 1;

				if (fwrite(&lang,sizeof(unsigned char),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite lang");
				}

				if (fwrite(&WordID,sizeof(unsigned long),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite WordID");
				}

				if (fwrite(&nr,sizeof(unsigned long),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite nr");
				}

				if (fwrite(&hits,sizeof(unsigned short),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite hits");
				}

				++nrOfLinkWordsToDocID[DocIDPlace];
			}
		}

		FreeSplitList(Data);

		#ifdef DEBUG
			if (DocID == 4999999) {
				printf("\n");
			}
		#endif
	}

	free(nrOfLinkWordsToDocID);

	return 0;
}
void wordsAdd(struct pagewordsFormatPartFormat *wordsPart, char word[],enum parsed_unit_flag puf) { int wordlLength; int wordTypeadd = 0; if (wordsPart->nr > maxWordForPage){ #ifdef DEBUG printf("mor then maxWordForPage words\n"); #endif } else { switch (puf) { case puf_none: //printf(" +p"); wordTypeadd=1000;break; case puf_title: //printf(" +title"); wordTypeadd=100; break; case puf_h1: //printf(" +h1"); wordTypeadd=500; break; case puf_h2: //printf(" +h2"); wordTypeadd=500; break; case puf_h3: //printf(" +h3"); wordTypeadd=500; break; case puf_h4: //printf(" +h4"); wordTypeadd=500; break; case puf_h5: //printf(" +h5"); wordTypeadd=500; break; case puf_h6: //printf(" +h6"); wordTypeadd=500; break; default: printf(" no catsh\n"); break; } wordlLength = strlen(word); //gjør om til små bokstaver convert_to_lowercase((unsigned char *)word); #ifdef PRESERVE_WORDS strcpy(wordsPart->words[wordsPart->nr].word,word); #endif wordsPart->words[wordsPart->nr].WordID = crc32boitho(word); #ifdef DEBUG printf(" (crc %s -> %u) ",word,wordsPart->words[wordsPart->nr].WordID); #endif wordsPart->words[wordsPart->nr].position = (wordsPart->nextPosition + wordTypeadd); // må ha en index posisjon her. Slik at vi kan finne ord før og etter. Posisjon er kodet wordsPart->words[wordsPart->nr].unsortetIndexPosition = wordsPart->nr; ++wordsPart->nextPosition; //printf("%s : %u\n",word,wordsPart->words[wordsPart->nr]); ++wordsPart->nr; } }