void gc_reduce(struct reformat *re, int LotNr, char subname[]) { FILE *GCEDFH; int i; unsigned int DocID; //lagrer hvilkene filer vi har slettet GCEDFH = lotOpenFileNoCasheByLotNr(LotNr,"gced","a", 'e',subname); for (i=0; i<NrofDocIDsInLot; i++) { if ((REN_DocumentIndex(re, i)->Url[0] != '\0') && DIS_isDeleted(REN_DocumentIndex(re, i))) { #ifdef DEBUG printf("Adding url \"%s\" to gc file\n",REN_DocumentIndex(re, i)->Url); #endif DocID = LotDocIDOfset(LotNr) +i; if (fwrite(&DocID,sizeof(DocID),1,GCEDFH) != 1) { perror("can't write gc file"); } } } fclose(GCEDFH); }
int main() { FILE *FH, *LOTFILE; struct stat inode; // lager en struktur for fstat å returnere. int nrOfElements; int LotNr, DocIDPlace, oldLotNr,i,n,rank; if ( (FH = fopen(SHORTPOPFILE,"rb")) == NULL ) { perror("open"); } fstat(fileno(FH),&inode); nrOfElements = inode.st_size; oldLotNr = -1; for (i=0;i<nrOfElements;i++) { if ((n=fread(&rank,sizeof(unsigned char),1,FH)) == -1) { perror("read"); } //finner lot og offset LotNr = rLotForDOCid(i); DocIDPlace = (i - LotDocIDOfset(LotNr)); //if (lotlistIsLocal(LotNr)) { // popMemArray[LotNr][DocIDPlace] = rank; //} ///////////////////////////// //debug: vise hvilkene lot vi laster if (LotNr != oldLotNr) { if (oldLotNr != -1) { rSendFileByOpenHandler(LOTFILE,"Brank",oldLotNr,"w",subname); close(LOTFILE); } //oppret et midlertidig fil får å holde datane LOTFILE = tmpfile(); printf("lot %i\n",LotNr); //printf("%i rank %i. Lot %i, ofset %i\n",i,(int)rank,LotNr,LotDocIDOfset(LotNr)); } oldLotNr = LotNr; //////////////////////////// //søker til rikig plass og skiiver fseek(LOTFILE,DocIDPlace,SEEK_SET); fwrite(&rank,sizeof(unsigned char),1,LOTFILE); //printf("DocID %i, rank %i, DocIDPlace %i\n",i,rank,DocIDPlace); } rSendFileByOpenHandler(LOTFILE,"Brank",oldLotNr,"w",subname); close(LOTFILE); close(FH); }
int DIPostAdress(unsigned int DocID) { int adress = -1; int LotNr; //finner lot for denne DocIDen LotNr = rLotForDOCid(DocID); #ifdef BLACK_BOX adress = (sizeof(struct DocumentIndexFormat) + sizeof(unsigned int))* (DocID - LotDocIDOfset(LotNr)); #else adress = sizeof(struct DocumentIndexFormat) * (DocID - LotDocIDOfset(LotNr)); #endif return adress; }
int popRankForDocIDMemArray(unsigned int DocID) { int LotNr,DocIDPlace; //finner lot og offset LotNr = rLotForDOCid(DocID); DocIDPlace = (DocID - LotDocIDOfset(LotNr)); if (popMemArray[LotNr] != 0) { #ifdef DEBUG printf("have rank %u, i:%i, y:%i\n",(unsigned int)popMemArray[LotNr][DocIDPlace],LotNr,DocIDPlace); #endif return popMemArray[LotNr][DocIDPlace]; } else { return 0; } }
/*
 * Returns the record for DocID from an open reformat handle.
 *
 * The position is re->structsize * (DocID - LotDocIDOfset(re->lotNr)). When
 * the RE_STARTS_AT_0 flag is set and the position is non-zero, one record
 * size is subtracted. The result of reposread(re, position) is returned
 * unchanged.
 */
void *reget(struct reformat *re, unsigned int DocID) {
    size_t position = (re->structsize * (DocID - LotDocIDOfset(re->lotNr)));

    if ((re->flags & RE_STARTS_AT_0) == RE_STARTS_AT_0) {
#ifdef DEBUG
        // size_t must be printed with %zu; %u is undefined where size_t is wider than unsigned
        printf("pso ord %zu, position %zu\n", position, (size_t)re->structsize);
#endif
        if (position != 0) {
            position -= re->structsize;
        }
    }

#ifdef DEBUG
    printf("regetp: DocID %u, position %zu, lot %i\n", DocID, position, re->lotNr);
#endif

    return reposread(re, position);
}
int adultWeightForDocIDMemArray(int DocID) { int LotNr,DocIDPlace; //hvis vi har en negativ DocID så er noe galt if (DocID < 0) { return -3; } //filler lot og offset LotNr = rLotForDOCid(DocID); DocIDPlace = (DocID - LotDocIDOfset(LotNr)); if (adultWeightMemArray[LotNr] != 0) { return adultWeightMemArray[LotNr][DocIDPlace]; } else { return 0; } }
int main (int argc, char *argv[]) { int i; int lotStart,lotEnd; FILE *TMPTRANSFER; off_t offset; FILE *IPDBFH; unsigned int IPadress; unsigned int DocID; char command[128]; if (argc < 4) { printf("Usage./ipdbMakeMain lotStart lotEnd ipdb\n\n"); exit(0); } lotStart = atoi(argv[1]); lotEnd = atoi(argv[2]); TMPTRANSFER = tmpfile(); if ((IPDBFH = fopen(argv[3],"wb")) == NULL) { perror(argv[3]); exit(1); } printf("opend ipdb %s\n",argv[3]); for(i=lotStart;i<(lotEnd +1);i++) { printf("get ipdb for %i\n",i); //ber om at ipdb skal lages sprintf(command,"/home/boitho/boithoTools/bin/ipdbBuildLotIndex %i",i); rComand(command,i,subname); //int rGetFileByOpenHandler(char source[],FILE *FILEHANDLER,int LotNr,char subname[]); rGetFileByOpenHandler("ipdb",TMPTRANSFER,i,subname); printf("rGetFileByOpenHandler end\n"); //resetter fseek(TMPTRANSFER,0,SEEK_SET); //finer filofset int lottoffset = LotDocIDOfset(i); DocID = lottoffset; //if (lottoffset == 0) { // offset = 0; //} //else { offset = ((lottoffset * sizeof(IPadress))); //} if (fseek(IPDBFH,offset,SEEK_SET) != 0) { perror("fseek"); printf("cant seek to %"PRId64"\n",offset); } //printf("offset: %" PRId64 ", ftell: %ul\n",offset,ftell(IPDBFH)); //kopierer over ipadresser while(!feof(TMPTRANSFER)) { fread(&IPadress,sizeof(IPadress),1,TMPTRANSFER); //printf("%u %u\n",DocID,IPadress); if (fwrite(&IPadress,sizeof(IPadress),1,IPDBFH) != 1) { perror("write"); } ++DocID; } } //printf("end write, ftel %ul\n",ftell(IPDBFH)); fclose(IPDBFH); fclose(TMPTRANSFER); }
int gcdecide(int LotNr, char *subname, struct gcaoptFormat *gcaopt, time_t newest_document) { int i; struct reformat *re; FILE *DOCINDEXFH; whisper_t whisper; //åpner dokument indeks får å teste at vi har en, hvis ikke kan vi bare avslutte. if ( (DOCINDEXFH = lotOpenFileNoCasheByLotNr(LotNr,"DocumentIndex","rb", 's',subname)) == NULL) { #ifdef DEBUG printf("lot dont have a DocumentIndex file\n"); #endif return 0; } fclose(DOCINDEXFH); blog(gcaopt->log,1,"Runing gc for collection \"%s\", lot nr %i",subname,LotNr); if((re = reopen(LotNr, sizeof(struct DocumentIndexFormat), "DocumentIndex", subname, RE_COPYONCLOSE|RE_HAVE_4_BYTES_VERSION_PREFIX)) == NULL) { perror("can't reopen()"); exit(1); } whisper = gcwhisper_read(subname); //går gjenom alle på jakt etter de som kan slettes for (i=0;i<NrofDocIDsInLot;i++) { if (DIS_isDeleted(REN_DocumentIndex(re, i))) { continue; } #ifdef DEBUG #ifdef BLACK_BOX printf("dokument \"%s\", lastSeen: %s", REN_DocumentIndex(re, i)->Url, ctime_s(&REN_DocumentIndex(re, i)->lastSeen)); #endif #endif #ifdef BLACK_BOX if ((whisper & GCWHISPER_NOTOLD) == 0 && (((gcaopt->lastSeenHack == 1) && (REN_DocumentIndex(re, i)->lastSeen == 0)) || ((REN_DocumentIndex(re, i)->lastSeen != 0) && (newest_document > (REN_DocumentIndex(re, i)->lastSeen + gcaopt->MaxAgeDiflastSeen))))) { //sletter DIS_delete(REN_DocumentIndex(re, i)); //sletter dokumentet i bb spesefike ting. bbdocument_delete (REN_DocumentIndex(re, i)->Url, subname); blog(gcaopt->log,2,"dokument \"%s\" can be deleted. Last seen: %s, DocID %u",REN_DocumentIndex(re, i)->Url,ctime_s(&REN_DocumentIndex(re, i)->lastSeen),LotDocIDOfset(LotNr) +i); ++gcaopt->gced; } else { ++gcaopt->keept; } #endif } //markerer hva vi kan slette. gc_reduce(re, LotNr, subname); reclose(re); //trunkerer reposetoryet. 
gcrepo(LotNr, subname); //vasker iindex struct IndekserOptFormat IndekserOpt; IndekserOpt.optMustBeNewerThen = 0; IndekserOpt.optAllowDuplicates = 0; IndekserOpt.optValidDocIDs = NULL; IndekserOpt.sequenceMode =1; IndekserOpt.garbareCollection = 1; for (i=0;i<64;i++) { Indekser(LotNr,"Main",i,subname,&IndekserOpt); } for (i=0;i<64;i++) { Indekser(LotNr,"acl_allow",i,subname,&IndekserOpt); } for (i=0;i<64;i++) { Indekser(LotNr,"acl_denied",i,subname,&IndekserOpt); } for (i=0;i<64;i++) { Indekser(LotNr,"attributes",i,subname,&IndekserOpt); } //siden vi nå har lagt til alle andringer fra rev index kan vi nå slettet gced filen også //Indekser_deleteGcedFile(LotNr, subname); lotDeleteFile("gced", LotNr, subname); return 0; }
/*
 * Turns a lot's "anchors" file into "Anchor" reverse-index bucket files.
 *
 * Usage: anchorread lotnr subname
 *
 * Each anchor text returned by anchorGetNext() is lower-cased and split on
 * spaces. Empty tokens and stop words are skipped, "www" only advances the
 * per-document word counter, and every other token is appended to the bucket
 * file selected by crc32boitho(token) % NrOfDataDirectorys as the raw sequence
 *   DocID (unsigned int), lang (unsigned char, always 0),
 *   WordID (unsigned long), nr (unsigned long, always 1),
 *   hit position (unsigned short, capped at 65535).
 * The per-document counter starts at 2000.
 */
int main (int argc, char *argv[]) {

    int lotNr;
    int i;
    unsigned int DocID;
    char text[50];
    unsigned int radress;
    unsigned int rsize;
    char **Data;
    int TokCount;
    unsigned short hits;
    unsigned long WordID;
    int bucket;
    // written with sizeof(unsigned long) below, so it must really be that type
    unsigned long nr;
    FILE *revindexFilesHa[NrOfDataDirectorys];
    unsigned char lang;
    FILE *FH;
    unsigned int DocIDPlace;
    int *nrOfLinkWordsToDocID;

    // check that we were told which lot to use
    if (argc < 3) {
        printf("Usage: ./anchorread lotnr subname\n\n");
        exit(1);
    }

    lotNr = atoi(argv[1]);
    char *subname = argv[2];

    nrOfLinkWordsToDocID = malloc(sizeof(int) * NrofDocIDsInLot);
    if (nrOfLinkWordsToDocID == NULL) {
        perror("malloc");
        exit(1);
    }

    for (i = 0; i < NrofDocIDsInLot; i++) {
        // start at 2000 so these are easy to tell apart visually from other hits
        nrOfLinkWordsToDocID[i] = 2000;
    }

    if ((FH = lotOpenFileNoCasheByLotNr(lotNr,"anchors","rb", 's',subname)) == NULL) {
        printf("lot dont have a anchors file\n");
        free(nrOfLinkWordsToDocID);
        exit(1);
    }
    fclose(FH);

    revindexFilesOpenLocal(revindexFilesHa,lotNr,"Anchor","wb",subname);

    while (anchorGetNext(lotNr,&DocID,text,sizeof(text),&radress,&rsize,subname)) {

        DocIDPlace = (DocID - LotDocIDOfset(rLotForDOCid(DocID)));
        ++nrOfLinkWordsToDocID[DocIDPlace];

        convert_to_lowercase((unsigned char *)text);

#ifdef DEBUG
        if (DocID == 4999999) {
            printf("DocID %u, text: \"%s\", DocIDPlace %u, nrOfLinkWordsToDocID %i\n",DocID,text,DocIDPlace,nrOfLinkWordsToDocID[DocIDPlace]);
        }
#endif

        if ((TokCount = split(text, " ", &Data)) == -1) {
            printf("canæt splitt \"%s\"\n",text);
            // NOTE(review): assumes Data is not usable after split() reports
            // failure, so this anchor is skipped instead of dereferencing it.
            continue;
        }

        for (i = 0; Data[i] != NULL; i++) {

            if (Data[i][0] == '\0') {
#ifdef DEBUG
                if (DocID == 4999999) {
                    printf("emty data element\n");
                }
#endif
            }
            else if (strcmp(Data[i],"www") == 0) {
#ifdef DEBUG
                if (DocID == 4999999) {
                    printf("www\n");
                }
#endif
                // not indexed, but still occupies a word position
                ++nrOfLinkWordsToDocID[DocIDPlace];
            }
            else if (isStoppWord(Data[i])) {
#ifdef DEBUG
                if (DocID == 4999999) {
                    printf("stopword \"%s\"\n",Data[i]);
                }
#endif
            }
            else {
#ifdef DEBUG
                if (DocID == 4999999) {
                    printf("\t\"%s\" %i\n",Data[i],nrOfLinkWordsToDocID[DocIDPlace]);
                }
#endif

                WordID = crc32boitho(Data[i]);

                if (WordID == 0) {
                    printf("got 0 as word id for \"%s\". Somthing may be wrong.\n",Data[i]);
                }

                bucket = WordID % NrOfDataDirectorys;

                // the position is stored as an unsigned short, so clamp it
                if (nrOfLinkWordsToDocID[DocIDPlace] > 65535) {
                    hits = 65535;
                }
                else {
                    hits = nrOfLinkWordsToDocID[DocIDPlace];
                }

#ifdef DEBUG
                if (DocID == 4999999) {
                    printf("\thits %i: \"%s\": %hu, bucket %i\n",i,Data[i],hits,bucket);
                }
#endif

                if (fwrite(&DocID,sizeof(unsigned int),1,revindexFilesHa[bucket]) != 1) {
                    perror("fwrite DocID");
                }

                // runarb, 13 May 2007: language is now stored as a number. It
                // ought to come from DocumentIndex, but it is not stored there.
                lang = 0;
                nr = 1;
                if (fwrite(&lang,sizeof(unsigned char),1,revindexFilesHa[bucket]) != 1) {
                    perror("fwrite lang");
                }
                if (fwrite(&WordID,sizeof(unsigned long),1,revindexFilesHa[bucket]) != 1) {
                    perror("fwrite WordID");
                }
                if (fwrite(&nr,sizeof(unsigned long),1,revindexFilesHa[bucket]) != 1) {
                    perror("fwrite nr");
                }
                if (fwrite(&hits,sizeof(unsigned short),1,revindexFilesHa[bucket]) != 1) {
                    perror("fwrite hits");
                }

                ++nrOfLinkWordsToDocID[DocIDPlace];
            }
        }

        FreeSplitList(Data);

#ifdef DEBUG
        if (DocID == 4999999) {
            printf("\n");
        }
#endif
    }

    free(nrOfLinkWordsToDocID);

    return 0;
}