void wordsAdd(char word[],enum parsed_unit_flag puf) { int i; int wordlLength; int wordTypeadd; if (pagewords.nr < maxWordForPage){ switch (puf) { case puf_none: //printf(" +p"); wordTypeadd=1000;break; case puf_title: //printf(" +title"); wordTypeadd=100; break; case puf_h1: //printf(" +h1"); wordTypeadd=500; break; case puf_h2: //printf(" +h2"); wordTypeadd=500; break; case puf_h3: //printf(" +h3"); wordTypeadd=500; break; case puf_h4: //printf(" +h4"); wordTypeadd=500; break; case puf_h5: //printf(" +h5"); wordTypeadd=500; break; case puf_h6: //printf(" +h6"); wordTypeadd=500; break; } wordlLength = strlen(word); //gjør om til små bokstaver for(i=0;i<wordlLength;i++) { word[i] = (char)tolower(word[i]); } #ifdef DEBUG_ADULT strcpy(pagewords.words[pagewords.nr].word,word); #endif pagewords.words[pagewords.nr].WordID = crc32boitho(word); pagewords.words[pagewords.nr].position = (pagewords.nextPosition + wordTypeadd); //printf("nextPosition %i, wordTypeadd %i, position %i\n",pagewords.nextPosition,wordTypeadd,pagewords.words[pagewords.nr].position); ++pagewords.nextPosition; //printf("%s : %lu\n",word,pagewords.words[pagewords.nr]); ++pagewords.nr; } else { //printf("To many words in dokument\n"); } }
/*
 * Adds one word to the global pagewords list.
 *
 * word : NUL-terminated word. It is lowercased in place before it is hashed.
 *
 * The stored position is the current value of the running position counter.
 * The word is silently dropped when pagewords already holds maxWordForPage
 * words.
 */
void wordsAdd(char word[]) {

	int i;
	int wordlLength;

	if (pagewords.nr < maxWordForPage){

		wordlLength = strlen(word);

		/* convert to lowercase; tolower() is only defined for unsigned char values and EOF */
		for (i = 0; i < wordlLength; i++) {
			word[i] = (char)tolower((unsigned char)word[i]);
		}

		#ifdef DEBUG_ADULT
			strcpy(pagewords.words[pagewords.nr].word,word);
		#endif

		pagewords.words[pagewords.nr].WordID = crc32boitho(word);
		pagewords.words[pagewords.nr].position = pagewords.nextPosition;

		++pagewords.nextPosition;
		++pagewords.nr;
	}
	else {
		/* too many words in the document: the word is dropped */
	}
}
void attribadd(struct IndexerRes_attrib *attrib, char word[]) { convert_to_lowercase((unsigned char *)word); #ifdef DEBUG printf("attribadd: got \"%s\"\n",word); #endif if ((*attrib).attribnr > maxAttribForPage){ #ifdef DEBUG printf("more than maxAttribForPage words\n"); #endif } else { (*attrib).attrib[(*attrib).attribnr].WordID = crc32boitho(word); (*attrib).attrib[(*attrib).attribnr].position = 0; ++(*attrib).attribnr; } }
void acladd(struct IndexerRes_acls *acl, char word[]) { convert_to_lowercase((unsigned char *)word); #ifdef DEBUG printf("acladd: got \"%s\"\n",word); #endif if ((*acl).aclnr > maxAclForPage){ #ifdef DEBUG printf("mor then maxAclForPage words\n"); #endif } else { (*acl).acls[(*acl).aclnr].WordID = crc32boitho(word); (*acl).acls[(*acl).aclnr].position = 0; ++(*acl).aclnr; } }
/************************************************************************************* * slår opp i databasen for å finne DoCID for en url *************************************************************************************/ int getDocIDFromUrl(char bdbfiledir[],char url[],unsigned int *DocID) { unsigned int crc32Value; int dbFileForUrl; int ret; DB *dbp; static char inited; static DB *dbp_store[nrOfUrlToDocIDFiles]; DBT key, data; char fileName[256]; crc32Value = crc32boitho(url); dbFileForUrl = (crc32Value % nrOfUrlToDocIDFiles); if (inited == 0) { int i; for(i = 0; i < nrOfUrlToDocIDFiles; i++) { sprintf(fileName,"%s%i.db",bdbfiledir,i); /* Create and initialize database object */ if ((ret = db_create(&dbp, NULL, 0)) != 0) { fprintf(stderr, "%s: db_create: %s\n", "getDocIDFromUrl", db_strerror(ret)); return (EXIT_FAILURE); } /* open the database. */ //if ((ret = dbp->open(dbp, NULL, fileName, NULL, DB_BTREE, DB_CREATE, 0444)) != 0) { if ((ret = dbp->open(dbp, NULL, fileName, NULL, DB_BTREE, DB_RDONLY, 0444)) != 0) { dbp->err(dbp, ret, "%s: open", fileName); //goto err1; } dbp_store[i] = dbp; } inited = 1; } dbp = dbp_store[dbFileForUrl]; //finner ut hvilken database vi skal opne //lager en has verdi slik at vi kan velge en av filene #ifdef DEBUG printf("Openig db %s\n",fileName); #endif /* Initialize the key/data pair so the flags aren't set. */ memset(&key, 0, sizeof(key)); memset(&data, 0, sizeof(data)); key.data = url; key.size = strlen(url); /* Walk through the database and print out the key/data pairs. */ if ((ret = dbp->get(dbp, NULL, &key, &data, 0)) == 0) { //printf("%s : %u-%i \n", key.data, *(int *)data.data,rLotForDOCid(*(int *)data.data)); *DocID = *(int *)data.data; return 1; } else if (ret == DB_NOTFOUND) { #ifdef DEBUG dbp->err(dbp, ret, "DBcursor->get"); #endif return 0; } else { dbp->err(dbp, ret, "DBcursor->get"); return 0; } }
void fn( char* word, int pos, enum parsed_unit pu, enum parsed_unit_flag puf, void* pagewords ) { #ifdef DEBUG printf("\t%s (%i) ", word, pos); printf("type %i ",pu); #endif switch (pu) { case pu_word: switch (puf) { case puf_none: printf(" none"); break; case puf_title: printf(" +title"); break; case puf_h1: printf(" +h1"); break; case puf_h2: printf(" +h2"); break; case puf_h3: printf(" +h3"); break; case puf_h4: printf(" +h4"); break; case puf_h5: printf(" +h5"); break; case puf_h6: printf(" +h6"); break; } convert_to_lowercase(word); printf("[word] is now %s (crc32 %u, pos %i)", word, crc32boitho(word), pos); break; case pu_linkword: printf("[linkword]"); break; case pu_link: printf("[link]"); break; case pu_baselink: printf("[baselink]"); break; case pu_meta_keywords: printf("[meta keywords]"); break; case pu_meta_description: printf("[meta description]"); break; case pu_meta_author: printf("[meta author]"); break; default: printf("[...]"); } printf("\n"); }
void adultLoad (struct adultFormat *adult) { FILE *FH; char buff[128]; int i,y,x; char *cpoint; char word1[128]; char word2[128]; int weight; unsigned long crc32tmp; //AdultWordsFile if ((FH = fopen(AdultWordsVektetFile,"r")) == NULL) { perror(AdultWordsVektetFile); exit(1); } i=0; while ((fgets(buff,sizeof(buff),FH) != NULL) && (i < maxAdultWords)) { //fjerner \n en på slutteten buff[strlen(buff) -1] = '\0'; //gjør om til lite case for(x=0;x<strlen(buff);x++) { buff[x] = tolower(buff[x]); } //finner space, som er det som skiller cpoint = strchr(buff,' '); if (cpoint != NULL) { strncpy((*adult).AdultWords[i].word,buff,cpoint - buff); //vil ikke ha men spacen. Går et hakk vidre ++cpoint; (*adult).AdultWords[i].weight = atoi(cpoint); (*adult).AdultWords[i].crc32 = crc32boitho((*adult).AdultWords[i].word); } //(*adult).AdultWords[i].word[strlen((*adult).AdultWords[i].word) -1] = '\0'; //printf("%i: -%s- %lu %i\n",i,(*adult).AdultWords[i].word,(*adult).AdultWords[i].crc32,(*adult).AdultWords[i].weight); ++i; } (*adult).adultWordnr = i; fclose(FH); qsort((*adult).AdultWords, i , sizeof(struct adultWordFormat), compare_elements_adultWord); //debug: vis alle ordene, sortert // for(y=0;y<i;y++) { // printf("%i: -%s- %lu %i\n",y,(*adult).AdultWords[y].word,(*adult).AdultWords[y].crc32,(*adult).AdultWords[y].weight); // } for(i=0;i<maxAdultWords;i++) { (*adult).adultFraser[i].adultWordCount = 0; } //AdultFraserFile if ((FH = fopen(AdultFraserVektetFile,"r")) == NULL) { perror(AdultFraserVektetFile); exit(1); } i=-1; while ((fgets(buff,sizeof(buff) -1,FH) != NULL) && (i < maxAdultWords)) { //gjør om til lite case for(x=0;x<strlen(buff);x++) { buff[x] = tolower(buff[x]); } //printf("buff %s\n",buff); if ((x=sscanf(buff,"%s %s %i\n",word1,word2,&weight))!=3) { printf("bad AdultFraserVektetFile format: %s\n",buff); } else { //printf("%i: %s, %s, %i\n",i,word1,word2,weight); //finner crc32 verdeien for første ord crc32tmp = crc32boitho(word1); //hvsi dette er første så her vi 
ikke noen forige å legge den til i, så vi må opprette ny //hvsi dette derimot har samme word1 som forige så legger vi det til if ((i!=-1) && (crc32tmp == (*adult).adultFraser[i].crc32)) { //printf("nr to\n"); } else { ++i; } strcpy((*adult).adultFraser[i].word,word1); (*adult).adultFraser[i].crc32 = crc32boitho(word1); (*adult).adultFraser[i].adultWord[(*adult).adultFraser[i].adultWordCount].weight = weight; strcpy((*adult).adultFraser[i].adultWord[(*adult).adultFraser[i].adultWordCount].word,word2); (*adult).adultFraser[i].adultWord[(*adult).adultFraser[i].adultWordCount].crc32 = crc32boitho(word2); if ((*adult).adultFraser[i].adultWordCount < MaxAdultWordCount -1) { ++(*adult).adultFraser[i].adultWordCount; } else { printf("MaxAdultWordCount %i for %s\n",MaxAdultWordCount,buff); } } } fclose(FH); (*adult).adultWordFrasernr = i; qsort((*adult).adultFraser, (*adult).adultWordFrasernr , sizeof(struct adultWordFraserFormat), compare_elements_AdultFraser); /* for(i=0;i<(*adult).adultWordFrasernr;i++) { printf("%i, -%s-, nr %i\n",i,(*adult).adultFraser[i].word,(*adult).adultFraser[i].adultWordCount); for(y=0;y<(*adult).adultFraser[i].adultWordCount;y++) { printf("\t %i: %s-%s: %i\n",y,(*adult).adultFraser[i].word,(*adult).adultFraser[i].adultWord[y].word,(*adult).adultFraser[i].adultWord[y].weight); } } */ }
void *issueAdd(void *arg) { int mysocfd = (int) arg; struct betaler_keywords_visninger_format { int kid; int betaler_side_id; }; struct betaler_keywords_visninger_format betaler_keywords_visninger[10]; char buff[1024]; struct timeval globalstart_time, globalend_time; unsigned int addid; char *strpointer; int siderType_ppctopNr,siderType_ppcsideNr; struct queryNodeHederFormat queryNodeHeder; char queryEscaped[MaxQueryLen*2+1]; char ppcprovider[32]; int i,n, y, net_status, showabal;; //sjekker vårt egent anonsesystem char mysql_query [2048]; static MYSQL demo_db; MYSQL_RES *mysqlres; /* To be used to fetch information into */ MYSQL_ROW mysqlrow; struct SiderHederFormat SiderHeder; struct ppcPagesFormat ppcPages[10]; struct SiderFormat *Sider; gettimeofday(&globalstart_time, NULL); if ((i=recv(mysocfd, &queryNodeHeder, sizeof(queryNodeHeder),MSG_WAITALL)) == -1) { perror("recv"); } printf("Query %s\n",queryNodeHeder.query); Sider = (struct SiderFormat *)malloc(sizeof(struct SiderFormat) * (queryNodeHeder.MaxsHits)); //setter alle sidene som sletett for (i=0;i<queryNodeHeder.MaxsHits;i++) { Sider[i].deletet = 1; } //sender svar med en gang at vi kan gjøre dette net_status = net_CanDo; if ((n=sendall(mysocfd,&net_status, sizeof(net_status))) != sizeof(net_status)) { printf("send only %i of %i\n",n,sizeof(net_status)); perror("sendall net_status"); } /********************************************************************************************/ #ifdef DEBUG printf("sending query to ppc db\n"); #endif mysql_init(&demo_db); #ifdef WITH_THREAD my_thread_init(); // kalt mysql_thread_init() i mysql 5.0 #endif //if(!mysql_real_connect(&demo_db, "www2.boitho.com", "boitho_remote", "G7J7v5L5Y7", "boitho", 3306, NULL, 0)){ if(!mysql_real_connect(&demo_db, "localhost", "boitho", "G7J7v5L5Y7", "boithoweb", 3306, NULL, 0)){ printf(mysql_error(&demo_db)); //return(1); pthread_exit((void *)1); /* exit with status */ } //escaper queryet rikit 
mysql_real_escape_string(&demo_db,queryEscaped,queryNodeHeder.query,strlen(queryNodeHeder.query)); sprintf(mysql_query, "select tittel,url,beskrivelse,betaler_sider.bruker_navn,betaler_keywords.betaler,betaler_keywords.kid,betaler_sider.id from betaler_keywords,betaler_sider where betaler_keywords.keyword ='%s' and betaler_keywords.betaler_side_id=betaler_sider.id order by betaler desc",queryEscaped); if(mysql_real_query(&demo_db, mysql_query, strlen(mysql_query))){ /* Make query */ printf(mysql_error(&demo_db)); //return(1); pthread_exit((void *)1); /* exit with status */ } #ifdef DEBUG printf("sending query to ppc db end\n"); #endif /********************************************************************************************/ SiderHeder.TotaltTreff = 0; int nrOfppcPages = 0; int nrOfBoithoAds = 0; //printer ut eventuelt ppc ord mysqlres=mysql_store_result(&demo_db); /* Download result from server */ while ((mysqlrow=mysql_fetch_row(mysqlres)) != NULL) { /* Get a row from the results */ //printf("\t<beskrivelse>%s</beskrivelse>\n",mysqlrow[2]); //Sider[showabal].type = siderType_ppctop; strncpy(ppcPages[nrOfppcPages].title,mysqlrow[0],sizeof(ppcPages[nrOfppcPages].title)); strncpy(ppcPages[nrOfppcPages].url,mysqlrow[1],sizeof(ppcPages[nrOfppcPages].url)); strncpy(ppcPages[nrOfppcPages].uri,mysqlrow[1],sizeof(ppcPages[nrOfppcPages].uri)); strncpy(ppcPages[nrOfppcPages].description,mysqlrow[2],sizeof(ppcPages[nrOfppcPages].description)); strncpy(ppcPages[nrOfppcPages].user,mysqlrow[3],sizeof(ppcPages[nrOfppcPages].user)); ppcPages[nrOfppcPages].thumbnail[0] = '\0'; ppcPages[nrOfppcPages].bid = atof(mysqlrow[4]); ppcPages[nrOfppcPages].keyword_id = atoi(mysqlrow[5]); ppcPages[nrOfppcPages].DocID = strtoul(mysqlrow[6], (char **)NULL, 10); ppcPages[nrOfppcPages].allrank = 10000; #ifdef DEBUG printf("aa bid %f\n",ppcPages[nrOfppcPages].bid); printf("\tUrl: %s\n",ppcPages[nrOfppcPages].url); printf("\tTitle: %s\n",ppcPages[nrOfppcPages].title); printf("keyword_id 
-%s-\n",mysqlrow[5]); #endif betaler_keywords_visninger[nrOfBoithoAds].kid = ppcPages[nrOfppcPages].keyword_id; betaler_keywords_visninger[nrOfBoithoAds].betaler_side_id = ppcPages[nrOfppcPages].DocID; ++nrOfppcPages; ++nrOfBoithoAds; } mysql_free_result(mysqlres); /*********************************/ printf("contry: %s\n",queryNodeHeder.GeoIPcontry); if (strcmp(queryNodeHeder.GeoIPcontry,"NO") == 0) { strcpy(ppcprovider,"hent"); //strcpy(ppcprovider,"revenuepilot"); } else { //alle språk //strcpy(ppcprovider,"revenuepilot"); //strcpy(ppcprovider,"searchboss"); } strcpy(ppcprovider,"amazon"); //temp: skrur av 3p xml feeds //getPpcAds(ppcprovider,ppcPages,&nrOfppcPages,&queryNodeHeder); //temp: Viser en mindre side da vi får problemer med siste? //nrOfppcPages--; showabal = 0; for (i=0;i<nrOfppcPages;i++) { /*********************************************/ //Sider[showabal].type = siderType_ppcside; #ifdef DEBUG printf("issue add. keyword_id %i\n",ppcPages[i].keyword_id); #endif sprintf(mysql_query, "insert into issuedadds values(%s,'%s','%f','%s',%s,'%s','%s','%s','%s','%s','%s','%s','%i','%i')", "NULL", queryEscaped, ppcPages[i].bid, ppcPages[i].uri, "NOW()", 0, ppcPages[i].user, queryNodeHeder.search_user, queryNodeHeder.userip, queryNodeHeder.HTTP_ACCEPT_LANGUAGE, queryNodeHeder.HTTP_USER_AGENT, queryNodeHeder.HTTP_REFERER, ppcPages[i].keyword_id, ppcPages[i].DocID ); #ifdef DEBUG printf("ppc user %s\naffuser %s\n",Sider[i].user,queryNodeHeder.search_user); #endif if(mysql_real_query(&demo_db, mysql_query, strlen(mysql_query))){ /* Make query */ printf("Cant insert into issuedadds: %s\nSql query vas %s\n",mysql_error(&demo_db),mysql_query); //return(1); pthread_exit((void *)1); /* exit with status */ } addid = mysql_insert_id(&demo_db); #ifdef DEBUG printf("addid %u\n",addid); #endif //sprintf(ppcPages[showabal].uri,"http://search.boitho.com/cgi-bin/addout.cgi?addid=%u&addurl=%s",addid,ppcPages[showabal].url); 
sprintf(ppcPages[showabal].uri,"http://bbh-001.boitho.com/cgi-bin/addout.cgi?addid=%u&addurl=%s",addid,ppcPages[showabal].url); //strcpy(Sider[i].uri,buff); /*********************************************/ if (strlen(ppcPages[i].title) == (sizeof(ppcPages[i].title) -1)) { //strcpy(Sider[showabal].title,"Title to long."); strncpy(Sider[showabal].title,ppcPages[i].title,sizeof(Sider[showabal].title) -3); strcat(Sider[showabal].title,".."); } else { strncpy(Sider[showabal].title,ppcPages[i].title,sizeof(Sider[showabal].title)); } strncpy(Sider[showabal].description,ppcPages[i].description,sizeof(Sider[showabal].description)); strncpy(Sider[showabal].url,ppcPages[i].url,sizeof(Sider[showabal].url)); strncpy(Sider[showabal].uri,ppcPages[i].uri,sizeof(Sider[showabal].uri)); strncpy(Sider[showabal].user,ppcPages[i].user,sizeof(Sider[showabal].user)); strscpy(Sider[showabal].domain,ppcPages[i].domain,sizeof(Sider[showabal].domain)); strscpy(Sider[showabal].thumbnale,ppcPages[i].thumbnail,sizeof(Sider[showabal].thumbnale)); Sider[showabal].thumbnailwidth = atol(ppcPages[i].thumbnailwidth); Sider[showabal].thumbnailheight = atol(ppcPages[i].thumbnailheight); Sider[showabal].bid = ppcPages[i].bid; Sider[showabal].iindex.allrank = ppcPages[i].allrank; #ifdef DEBUG printf("%s\t%s\t%f\n",Sider[showabal].url,Sider[showabal].title,ppcPages[i].bid); #endif ++showabal; } /*********************************/ siderType_ppctopNr = 0; siderType_ppcsideNr = 0; for(i=0;i<showabal;i++) { #ifdef DEBUG printf("uri %s\n",Sider[i].uri); #endif Sider[i].DocumentIndex.crc32 = crc32boitho(Sider[i].description); Sider[i].deletet = 0; //lager fin beskrivlse som slutter på .. 
isteden får bare et kappet ord, hvis beskrivlese er for lang if (strlen(Sider[i].description) >= 250) { //søker oss til siste space , eller ; og avslutter der if ((strpointer = (char *)strrchr(Sider[i].description,' ')) != NULL) { strpointer[0] = '\0'; } else if ((strpointer = (char *)strrchr(Sider[i].description,';')) != NULL) { ++strpointer; //pekeren peker på semikolonet. SKal ha det med, så må legge il en strpointer[0] = '\0'; } strncat(Sider[i].description,"..",2); } //hiliter ordet sprintf(buff,"<b>%s</b>",queryNodeHeder.query); strcasesandr(Sider[i].description,sizeof(Sider[i].description),queryNodeHeder.query,buff); //bestemmer ppc type //Sider[showabal].type = siderType_ppcside //Sider[i].type = siderType_ppctop; if ((siderType_ppctopNr < 2) && (strcasestr(Sider[i].description,queryNodeHeder.query) != 0)) { Sider[i].type = siderType_ppctop; ++siderType_ppctopNr; } else { Sider[i].type = siderType_ppcside; ++siderType_ppcsideNr; } } //legger datane in i mysql database. for(i=0;i<showabal;i++) { } gettimeofday(&globalend_time, NULL); SiderHeder.total_usecs = getTimeDifference(&globalstart_time,&globalend_time); SiderHeder.TotaltTreff = showabal; SiderHeder.showabal = showabal; SiderHeder.filtered = 0; SiderHeder.hiliteQuery[0] = '\0'; sprintf(SiderHeder.servername,"adserver.boitho.com"); //SiderHeder.queryTime = 0; if ((n=sendall(mysocfd,&SiderHeder, sizeof(SiderHeder))) != sizeof(SiderHeder)) { printf("send only %i of %i\n",n,sizeof(SiderHeder)); perror("sendall SiderHeder"); } for(i=0;i<SiderHeder.showabal;i++) { //for (i=0;i<queryNodeHeder.MaxsHits;i++) { #ifdef DEBUG printf("sending %s, deletet %i\n",Sider[i].url,Sider[i].deletet); printf("bb: -%s-\n",Sider[i].title); printf("url: -%s-\n",Sider[i].url); #endif //if (!Sider[i].deletet) { if ((n=sendall(mysocfd,&Sider[i], sizeof(struct SiderFormat))) != sizeof(struct SiderFormat)) { printf("send only %i of %i\n",n,sizeof(struct SiderFormat)); perror("sendall"); } //} } //logger alle visningene vi har hatt 
på egen ppc ord for (i=0;i<nrOfBoithoAds;i++) { sprintf(mysql_query, "insert DELAYED into betaler_keywords_visninger values(NULL,'%i','%i',NOW())",betaler_keywords_visninger[i].kid,betaler_keywords_visninger[i].betaler_side_id); if(mysql_real_query(&demo_db, mysql_query, strlen(mysql_query))){ /* Make query */ printf(mysql_error(&demo_db)); //return(1); pthread_exit((void *)0); /* exit with status */ } } mysql_close(&demo_db); //close(mysocfd); free(Sider); close(mysocfd); #ifdef WITH_THREAD my_thread_end(); // kalt mysql_thread_end() i mysql 5.0 pthread_exit((void *)0); /* exit with status */ #endif printf("end\n"); //return 0; }
/*
 * Builds the "Anchor" reverse index files of one lot from its anchors file.
 *
 * Usage: ./anchorread lotnr subname
 *
 * Every anchor text read with anchorGetNext() is lowercased and split on
 * spaces. For each word that is neither empty, "www" nor a stop word, one
 * record is written to the reverse index file selected by
 * WordID % NrOfDataDirectorys:
 *
 *     DocID (unsigned int), language (unsigned char, always 0),
 *     WordID (unsigned long), number of hits (unsigned long, always 1),
 *     hit position (unsigned short)
 *
 * The hit position is a per-DocID counter that starts at 2000, is advanced
 * once per anchor text, once per "www" and once per written word, and is
 * stored as 65535 once it no longer fits in an unsigned short.
 */
int main (int argc, char *argv[]) {

	int lotNr;
	int i;
	unsigned int DocID;
	char text[50];
	unsigned int radress;
	unsigned int rsize;
	char **Data;
	int TokCount;
	unsigned short hits;
	unsigned long WordID;
	/* unsigned long because it is written with sizeof(unsigned long) below */
	unsigned long nr;
	int bucket;
	FILE *revindexFilesHa[NrOfDataDirectorys];
	unsigned char lang;
	FILE *FH;
	unsigned int DocIDPlace;
	int *nrOfLinkWordsToDocID;
	char *subname;

	/* check that we were told which lot to use */
	if (argc < 3) {
		printf("Usage: ./anchorread lotnr subname\n\n");
		exit(1);
	}

	lotNr = atoi(argv[1]);
	subname = argv[2];

	nrOfLinkWordsToDocID = malloc(sizeof(*nrOfLinkWordsToDocID) * NrofDocIDsInLot);
	if (nrOfLinkWordsToDocID == NULL) {
		perror("malloc nrOfLinkWordsToDocID");
		exit(1);
	}

	for (i=0;i<NrofDocIDsInLot;i++) {
		/* start at 2000 so these are easy to tell apart visually from other hits */
		nrOfLinkWordsToDocID[i] = 2000;
	}

	/* only used to find out whether the lot has an anchors file at all */
	if ( (FH = lotOpenFileNoCasheByLotNr(lotNr,"anchors","rb", 's',subname)) == NULL) {
		printf("lot dont have a anchors file\n");
		exit(1);
	}
	fclose(FH);

	revindexFilesOpenLocal(revindexFilesHa,lotNr,"Anchor","wb",subname);

	while (anchorGetNext(lotNr,&DocID,text,sizeof(text),&radress,&rsize,subname) ) {

		DocIDPlace = (DocID - LotDocIDOfset(rLotForDOCid(DocID)));

		/* DocIDPlace indexes the counter array, so it has to lie inside the lot */
		if (DocIDPlace >= NrofDocIDsInLot) {
			printf("DocID %u is outside lot %i, skipping\n",DocID,lotNr);
			continue;
		}

		++nrOfLinkWordsToDocID[DocIDPlace];

		convert_to_lowercase((unsigned char *)text);

		#ifdef DEBUG
		if (DocID == 4999999) {
			printf("DocID %i, text: \"%s\", DocIDPlace %i, nrOfLinkWordsToDocID %i\n",DocID,text,DocIDPlace,nrOfLinkWordsToDocID[DocIDPlace]);
		}
		#endif

		if ((TokCount = split(text, " ", &Data)) == -1) {
			printf("canæt splitt \"%s\"\n",text);
			/* Data is not usable after a failed split */
			continue;
		}

		i=0;
		while (Data[i] != NULL) {

			if (Data[i][0] == '\0') {
				#ifdef DEBUG
				if (DocID == 4999999) {
					printf("emty data element\n");
				}
				#endif
			}
			else if (strcmp(Data[i],"www") == 0) {
				#ifdef DEBUG
				if (DocID == 4999999) {
					printf("www\n");
				}
				#endif
				/* "www" is not indexed but still takes up a position */
				++nrOfLinkWordsToDocID[DocIDPlace];
			}
			else if (isStoppWord(Data[i])) {
				#ifdef DEBUG
				if (DocID == 4999999) {
					printf("stopword \"%s\"\n",Data[i]);
				}
				#endif
				/* stop words are not indexed and take up no position */
			}
			else {

				#ifdef DEBUG
				if (DocID == 4999999) {
					printf("\t\"%s\" %i\n",Data[i],nrOfLinkWordsToDocID[DocIDPlace]);
				}
				#endif

				WordID = crc32boitho(Data[i]);

				if (WordID == 0) {
					printf("got 0 as word id for \"%s\". Somthing may be wrong.\n",Data[i]);
				}

				bucket = WordID % NrOfDataDirectorys;

				/* the position is stored as an unsigned short, so it tops out at 65535 */
				if (nrOfLinkWordsToDocID[DocIDPlace] > 65535) {
					hits = 65535;
				}
				else {
					hits = nrOfLinkWordsToDocID[DocIDPlace];
				}

				#ifdef DEBUG
				if (DocID == 4999999) {
					printf("\thits %i: \"%s\": %hu, bucket %i\n",i,Data[i],hits,bucket);
				}
				#endif

				if (fwrite(&DocID,sizeof(unsigned int),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite DocID");
				}

				/*
				 * runarb, 13 May 2007: we have switched to using a number
				 * for the language. It ought to be taken from DocumentIndex
				 * if it exists, but it is not stored there; see IndexRes for
				 * how it is done there.
				 */
				lang = 0;
				nr = 1;

				if(fwrite(&lang,sizeof(unsigned char),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite lang");
				}

				if(fwrite(&WordID,sizeof(unsigned long),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite WordID");
				}

				if(fwrite(&nr,sizeof(unsigned long),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite nr");
				}

				if(fwrite(&hits,sizeof(unsigned short),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite hits");
				}

				++nrOfLinkWordsToDocID[DocIDPlace];
			}

			++i;
		}

		FreeSplitList(Data);

		#ifdef DEBUG
		if (DocID == 4999999) {
			printf("\n");
		}
		#endif
	}

	free(nrOfLinkWordsToDocID);

	return 0;
}
void wordsAdd(struct pagewordsFormatPartFormat *wordsPart, char word[],enum parsed_unit_flag puf) { int wordlLength; int wordTypeadd = 0; if (wordsPart->nr > maxWordForPage){ #ifdef DEBUG printf("mor then maxWordForPage words\n"); #endif } else { switch (puf) { case puf_none: //printf(" +p"); wordTypeadd=1000;break; case puf_title: //printf(" +title"); wordTypeadd=100; break; case puf_h1: //printf(" +h1"); wordTypeadd=500; break; case puf_h2: //printf(" +h2"); wordTypeadd=500; break; case puf_h3: //printf(" +h3"); wordTypeadd=500; break; case puf_h4: //printf(" +h4"); wordTypeadd=500; break; case puf_h5: //printf(" +h5"); wordTypeadd=500; break; case puf_h6: //printf(" +h6"); wordTypeadd=500; break; default: printf(" no catsh\n"); break; } wordlLength = strlen(word); //gjør om til små bokstaver convert_to_lowercase((unsigned char *)word); #ifdef PRESERVE_WORDS strcpy(wordsPart->words[wordsPart->nr].word,word); #endif wordsPart->words[wordsPart->nr].WordID = crc32boitho(word); #ifdef DEBUG printf(" (crc %s -> %u) ",word,wordsPart->words[wordsPart->nr].WordID); #endif wordsPart->words[wordsPart->nr].position = (wordsPart->nextPosition + wordTypeadd); // må ha en index posisjon her. Slik at vi kan finne ord før og etter. Posisjon er kodet wordsPart->words[wordsPart->nr].unsortetIndexPosition = wordsPart->nr; ++wordsPart->nextPosition; //printf("%s : %u\n",word,wordsPart->words[wordsPart->nr]); ++wordsPart->nr; } }