Пример #1
0
/*
 * Appends the DocID of every deleted document in a lot to the lot's "gced" file.
 *
 * re       DocumentIndex handle for the lot, read through REN_DocumentIndex()
 * LotNr    lot whose NrofDocIDsInLot slots are scanned
 * subname  collection name passed on to lotOpenFileNoCasheByLotNr()
 *
 * A slot is recorded when its Url is non-empty and DIS_isDeleted() is true for it.
 * Each record is one raw unsigned int (LotDocIDOfset(LotNr) + slot index).
 * I/O failures are reported with perror(); a failed open aborts the scan,
 * a failed write does not.
 */
void gc_reduce(struct reformat *re, int LotNr, char subname[]) {

    FILE *GCEDFH;
    int i;
    unsigned int DocID;

    // remembers which documents have been deleted
    GCEDFH = lotOpenFileNoCasheByLotNr(LotNr,"gced","a", 'e',subname);
    if (GCEDFH == NULL) {
        // nothing can be recorded without the file; do not touch a NULL stream
        perror("can't open gc file");
        return;
    }

    for (i=0; i<NrofDocIDsInLot; i++) {

        if ((REN_DocumentIndex(re, i)->Url[0] != '\0') && DIS_isDeleted(REN_DocumentIndex(re, i))) {
#ifdef DEBUG
            printf("Adding url \"%s\" to gc file\n",REN_DocumentIndex(re, i)->Url);
#endif

            DocID = LotDocIDOfset(LotNr) +i;
            if (fwrite(&DocID,sizeof(DocID),1,GCEDFH) != 1) {
                perror("can't write gc file");
            }

        }
    }

    // fclose flushes the buffered records, so a failure here means lost data
    if (fclose(GCEDFH) != 0) {
        perror("can't close gc file");
    }

}
Пример #2
0
/*
 * Splits SHORTPOPFILE into one "Brank" file per lot.
 *
 * The input is read one byte at a time; byte number i is treated as the rank
 * of DocID i. Ranks are collected in a temporary file at the DocID's position
 * inside its lot, and every time the lot number changes the finished
 * temporary file is handed to rSendFileByOpenHandler().
 *
 * Returns 0 on success; exits with status 1 when the input cannot be opened
 * or inspected, or when a temporary file cannot be created.
 */
int main() {
	FILE *FH;
	FILE *LOTFILE = NULL;	// stays NULL until the first lot is started
	struct stat inode;	// filled in by fstat
	int nrOfElements;
	int LotNr, DocIDPlace, oldLotNr, i;
	unsigned char rank;	// exactly one byte is read and written per document

	if ( (FH = fopen(SHORTPOPFILE,"rb")) == NULL ) {
		perror("open");
		exit(1);
	}

	if (fstat(fileno(FH),&inode) != 0) {
		perror("fstat");
		fclose(FH);
		exit(1);
	}

	// one byte per document, so the file size is the element count
	nrOfElements = inode.st_size;

	oldLotNr = -1;
	for (i=0;i<nrOfElements;i++) {
		// fread returns the number of items read, never -1
		if (fread(&rank,sizeof(rank),1,FH) != 1) {
			perror("read");
			break;
		}

		// find lot and offset inside the lot
		LotNr = rLotForDOCid(i);
		DocIDPlace = (i - LotDocIDOfset(LotNr));

		if (LotNr != oldLotNr) {
			// ship the lot we just finished before starting the next one
			if (LOTFILE != NULL) {
				rSendFileByOpenHandler(LOTFILE,"Brank",oldLotNr,"w",subname);
				fclose(LOTFILE);
			}

			// create a temporary file to hold the data for the new lot
			LOTFILE = tmpfile();
			if (LOTFILE == NULL) {
				perror("tmpfile");
				fclose(FH);
				exit(1);
			}

			// debug: show which lot is being loaded
			printf("lot %i\n",LotNr);
		}
		oldLotNr = LotNr;

		// seek to the right place and write
		if (fseek(LOTFILE,DocIDPlace,SEEK_SET) != 0) {
			perror("fseek");
		}
		else if (fwrite(&rank,sizeof(rank),1,LOTFILE) != 1) {
			perror("write");
		}
	}

	// ship the last lot; there is none when the input was empty
	if (LOTFILE != NULL) {
		rSendFileByOpenHandler(LOTFILE,"Brank",oldLotNr,"w",subname);
		fclose(LOTFILE);
	}
	fclose(FH);

	return 0;
}
/*
 * Returns the byte offset of a DocID's record, counted from the start of the
 * lot the DocID belongs to.
 *
 * The offset is the record size times the DocID's index within its lot. With
 * BLACK_BOX defined, each record is counted as one DocumentIndexFormat plus
 * one unsigned int; otherwise as one DocumentIndexFormat.
 */
int DIPostAdress(unsigned int DocID) {

	// lot that holds this DocID
	const int lot = rLotForDOCid(DocID);

#ifdef BLACK_BOX
	return (sizeof(struct DocumentIndexFormat) + sizeof(unsigned int)) * (DocID - LotDocIDOfset(lot));
#else
	return sizeof(struct DocumentIndexFormat) * (DocID - LotDocIDOfset(lot));
#endif
}
Пример #4
0
/*
 * Looks up the rank of a DocID in popMemArray.
 *
 * Returns the stored value, or 0 when popMemArray has no entry for the
 * DocID's lot.
 */
int popRankForDocIDMemArray(unsigned int DocID) {
	// lot number and index within that lot
	const int lot = rLotForDOCid(DocID);
	const int slot = (DocID - LotDocIDOfset(lot));

	if (popMemArray[lot] == 0) {
		return 0;
	}

	#ifdef DEBUG
		printf("have rank %u, i:%i, y:%i\n",(unsigned int)popMemArray[lot][slot],lot,slot);
	#endif
	return popMemArray[lot][slot];
}
Пример #5
0
/*
 * Returns whatever reposread() yields for the record belonging to DocID.
 *
 * The byte position is re->structsize times the DocID's index within lot
 * re->lotNr. When RE_STARTS_AT_0 is set in re->flags, a non-zero position is
 * moved back by one record before the read.
 */
void *reget(struct reformat *re, unsigned int DocID) {

	size_t position = (re->structsize * (DocID - LotDocIDOfset(re->lotNr)));

	if ((re->flags & RE_STARTS_AT_0) == RE_STARTS_AT_0) {
		#ifdef DEBUG
		// size_t must be printed with %zu; %u is a format/argument mismatch
		printf("pso ord %zu, position %zu\n", position,(size_t)re->structsize);
		#endif
		if (position != 0) {
			position -= re->structsize;
		}
	}
	#ifdef DEBUG
	printf("regetp: DocID %u, position %zu, lot %i\n",DocID, position, re->lotNr);
	#endif

	return reposread(re,position);

}
Пример #6
0
/*
 * Looks up the adult weight of a DocID in adultWeightMemArray.
 *
 * Returns -3 for a negative DocID, 0 when adultWeightMemArray has no entry
 * for the DocID's lot, and the stored value otherwise.
 */
int adultWeightForDocIDMemArray(int DocID) {
        int lot, slot;

        // a negative DocID means something is wrong
        if (DocID < 0) {
                return -3;
        }

        // lot number and index within that lot
        lot = rLotForDOCid(DocID);
        slot = (DocID - LotDocIDOfset(lot));

        if (adultWeightMemArray[lot] == 0) {
                return 0;
        }

        return adultWeightMemArray[lot][slot];
}
Пример #7
0
/*
 * Builds one combined ipdb file from the per-lot "ipdb" files.
 *
 * Usage: ipdbMakeMain lotStart lotEnd ipdb
 *
 * For every lot in [lotStart, lotEnd] the lot index is built via rComand(),
 * the lot's "ipdb" file is fetched into a temporary file with
 * rGetFileByOpenHandler(), and its unsigned int records are copied into the
 * output file starting at record number LotDocIDOfset(lot).
 *
 * Returns 0 on completion; exits with status 1 when the temporary file or the
 * output file cannot be opened.
 */
int main (int argc, char *argv[]) {

	int i;
	int lotStart,lotEnd;
	FILE *TMPTRANSFER;
	off_t offset;
	FILE *IPDBFH;
	unsigned int IPadress;
	char command[128];

	if (argc < 4) {
		printf("Usage./ipdbMakeMain lotStart lotEnd ipdb\n\n");
		exit(0);
	}

	lotStart = atoi(argv[1]);
	lotEnd = atoi(argv[2]);

	if ((TMPTRANSFER = tmpfile()) == NULL) {
		perror("tmpfile");
		exit(1);
	}

	if ((IPDBFH = fopen(argv[3],"wb")) == NULL) {
		perror(argv[3]);
		fclose(TMPTRANSFER);
		exit(1);
	}
	printf("opend ipdb %s\n",argv[3]);

	for(i=lotStart;i<=lotEnd;i++) {
		printf("get ipdb for %i\n",i);

		// ask for the lot's ipdb to be built
		snprintf(command,sizeof(command),"/home/boitho/boithoTools/bin/ipdbBuildLotIndex %i",i);
		rComand(command,i,subname);

		rGetFileByOpenHandler("ipdb",TMPTRANSFER,i,subname);
		printf("rGetFileByOpenHandler end\n");

		// rewind so the fetched data is read from the start
		fseek(TMPTRANSFER,0,SEEK_SET);

		// find the file offset; multiply in off_t so large lots do not wrap
		int lottoffset = LotDocIDOfset(i);
		offset = (off_t)lottoffset * (off_t)sizeof(IPadress);

		if (fseek(IPDBFH,offset,SEEK_SET) != 0) {
			perror("fseek");
			printf("cant seek to %"PRId64"\n",(int64_t)offset);
			// writing at the wrong position would overwrite other lots' records
			continue;
		}

		// copy the IP addresses; loop on fread's result rather than feof(), which
		// only turns true after a read has already failed and would duplicate
		// the final record
		while (fread(&IPadress,sizeof(IPadress),1,TMPTRANSFER) == 1) {

			if (fwrite(&IPadress,sizeof(IPadress),1,IPDBFH) != 1) {
				perror("write");
			}
		}
		if (ferror(TMPTRANSFER)) {
			perror("read");
		}

	}

	// fclose flushes buffered output, so its result matters for the written file
	if (fclose(IPDBFH) != 0) {
		perror(argv[3]);
	}

	fclose(TMPTRANSFER);

	return 0;
}
Пример #8
0
/*
 * Runs garbage collection for one lot of a collection.
 *
 * LotNr            lot to process
 * subname          collection name
 * gcaopt           options (log, lastSeenHack, MaxAgeDiflastSeen) and the
 *                  gced / keept counters, which are incremented here
 * newest_document  timestamp that each document's lastSeen is compared to
 *
 * Steps: return early if the lot has no DocumentIndex file; under BLACK_BOX,
 * mark documents that are too old as deleted; record deleted documents with
 * gc_reduce(); call gcrepo(); re-run Indekser() for every index type and
 * bucket; finally delete the "gced" file.
 *
 * Always returns 0. Exits the process with status 1 if reopen() fails.
 */
int
gcdecide(int LotNr, char *subname, struct gcaoptFormat *gcaopt, time_t newest_document)
{
	struct reformat *re;
	whisper_t whisper;
	FILE *probe;
	int docNr;
	int typeNr;
	int bucket;

	// open the document index only to verify the lot has one; if not we are done
	probe = lotOpenFileNoCasheByLotNr(LotNr,"DocumentIndex","rb", 's',subname);
	if (probe == NULL) {
		#ifdef DEBUG
			printf("lot dont have a DocumentIndex file\n");
		#endif

		return 0;
	}
	fclose(probe);

	blog(gcaopt->log,1,"Runing gc for collection \"%s\", lot nr %i",subname,LotNr);

	re = reopen(LotNr, sizeof(struct DocumentIndexFormat), "DocumentIndex", subname, RE_COPYONCLOSE|RE_HAVE_4_BYTES_VERSION_PREFIX);
	if (re == NULL) {
		perror("can't reopen()");
		exit(1);
	}

	whisper = gcwhisper_read(subname);

	// walk every slot looking for documents that can be deleted
	for (docNr = 0; docNr < NrofDocIDsInLot; docNr++) {

		// already deleted, nothing to decide
		if (DIS_isDeleted(REN_DocumentIndex(re, docNr))) {
			continue;
		}

		#if defined(DEBUG) && defined(BLACK_BOX)
			printf("dokument \"%s\", lastSeen: %s",
				REN_DocumentIndex(re, docNr)->Url,
				ctime_s(&REN_DocumentIndex(re, docNr)->lastSeen));
		#endif

		#ifdef BLACK_BOX
		{
			int collectable = 0;

			if ((whisper & GCWHISPER_NOTOLD) == 0) {
				if (REN_DocumentIndex(re, docNr)->lastSeen == 0) {
					// never seen: only collectable when the lastSeen hack is on
					collectable = (gcaopt->lastSeenHack == 1);
				}
				else {
					// seen before: collectable once it is older than the allowed age
					collectable = (newest_document > (REN_DocumentIndex(re, docNr)->lastSeen + gcaopt->MaxAgeDiflastSeen));
				}
			}

			if (!collectable) {
				++gcaopt->keept;
				continue;
			}

			// mark as deleted in the document index
			DIS_delete(REN_DocumentIndex(re, docNr));

			// delete the document from the bb specific storage
			bbdocument_delete (REN_DocumentIndex(re, docNr)->Url, subname);

			blog(gcaopt->log,2,"dokument \"%s\" can be deleted. Last seen: %s, DocID %u",REN_DocumentIndex(re, docNr)->Url,ctime_s(&REN_DocumentIndex(re, docNr)->lastSeen),LotDocIDOfset(LotNr) +docNr);
			++gcaopt->gced;
		}
		#endif
	}

	// record what can be deleted
	gc_reduce(re, LotNr, subname);

	reclose(re);

	// truncate the repository
	gcrepo(LotNr, subname);

	// clean the inverted indexes: same options for every index type and bucket
	struct IndekserOptFormat IndekserOpt;
	IndekserOpt.optMustBeNewerThen = 0;
	IndekserOpt.optAllowDuplicates = 0;
	IndekserOpt.optValidDocIDs = NULL;
	IndekserOpt.sequenceMode = 1;
	IndekserOpt.garbareCollection = 1;

	char *indexTypes[] = { "Main", "acl_allow", "acl_denied", "attributes" };

	for (typeNr = 0; typeNr < 4; typeNr++) {
		for (bucket = 0; bucket < 64; bucket++) {
			Indekser(LotNr,indexTypes[typeNr],bucket,subname,&IndekserOpt);
		}
	}

	// all changes from the rev index have been applied, so the gced file can go too
	lotDeleteFile("gced", LotNr, subname);

	return 0;
}
Пример #9
0
/*
 * Builds the "Anchor" reverse-index files for one lot from its anchors file.
 *
 * Usage: anchorread lotnr subname
 *
 * Every anchor text returned by anchorGetNext() is lower-cased and split on
 * spaces. For each word that is not empty, not "www" and not a stop word, one
 * record is appended to the reverse-index file selected by
 * crc32boitho(word) % NrOfDataDirectorys:
 *     DocID (unsigned int), lang (unsigned char, always 0),
 *     WordID (unsigned long), nr (unsigned long, always 1),
 *     hits (unsigned short).
 * hits is a per-document running position that starts at 2000 and is capped
 * at 65535.
 *
 * The reverse-index streams are not closed here; returning from main flushes
 * and closes all open stdio streams.
 *
 * Returns 0 on completion; exits with status 1 on bad arguments, a missing
 * anchors file or an allocation failure.
 */
int main (int argc, char *argv[]) {

	int lotNr;
	int i;
	unsigned int DocID;
	char text[50];
	unsigned int radress;
	unsigned int rsize;
	char **Data;
	unsigned short hits;
	unsigned long WordID;
	// written with sizeof(unsigned long), so it must really be that wide;
	// an int here makes fwrite read past the object where long is 8 bytes
	unsigned long nr;
	int bucket;
	FILE *revindexFilesHa[NrOfDataDirectorys];
	unsigned char lang;
	FILE *FH;
	unsigned int DocIDPlace;
	int *nrOfLinkWordsToDocID;

	// check that we were told which lot to use
	if (argc < 3) {
		printf("Usage: ./anchorread lotnr subname\n\n");
		exit(1);
	}

	lotNr = atoi(argv[1]);
	char *subname = argv[2];

	if ( (FH = lotOpenFileNoCasheByLotNr(lotNr,"anchors","rb", 's',subname)) == NULL) {
		printf("lot dont have a anchors file\n");
		exit(1);
	}
	fclose(FH);

	nrOfLinkWordsToDocID = malloc(sizeof(*nrOfLinkWordsToDocID) * NrofDocIDsInLot);
	if (nrOfLinkWordsToDocID == NULL) {
		perror("malloc");
		exit(1);
	}

	for (i=0;i<NrofDocIDsInLot;i++) {
		// start at 2000 so these are easy to tell apart visually from other hits
		nrOfLinkWordsToDocID[i] = 2000;
	}

	revindexFilesOpenLocal(revindexFilesHa,lotNr,"Anchor","wb",subname);

	while (anchorGetNext(lotNr,&DocID,text,sizeof(text),&radress,&rsize,subname) ) {

		DocIDPlace = (DocID - LotDocIDOfset(rLotForDOCid(DocID)));
		++nrOfLinkWordsToDocID[DocIDPlace];

		convert_to_lowercase((unsigned char *)text);

		#ifdef DEBUG
		if (DocID == 4999999) {
			printf("DocID %u, text: \"%s\", DocIDPlace %u, nrOfLinkWordsToDocID %i\n",DocID,text,DocIDPlace,nrOfLinkWordsToDocID[DocIDPlace]);
		}
		#endif

		if (split(text, " ", &Data) == -1) {
			printf("canæt splitt \"%s\"\n",text);
			// Data cannot be trusted after a failed split, so skip this anchor
			continue;
		}

		for (i = 0; Data[i] != NULL; i++) {

			if (Data[i][0] == '\0') {
				#ifdef DEBUG
					if (DocID == 4999999) {

						printf("emty data element\n");
					}
				#endif
			}
			else if (strcmp(Data[i],"www") == 0) {
				#ifdef DEBUG
					if (DocID == 4999999) {
						printf("www\n");
					}
				#endif
				// "www" is not indexed but still advances the position
				++nrOfLinkWordsToDocID[DocIDPlace];
			}
			else if (isStoppWord(Data[i])) {
				// stop words are neither indexed nor counted
				#ifdef DEBUG
					if (DocID == 4999999) {
						printf("stopword \"%s\"\n",Data[i]);
					}
				#endif
			}
			else {

				#ifdef DEBUG
					if (DocID == 4999999) {
						printf("\t\"%s\" %i\n",Data[i],nrOfLinkWordsToDocID[DocIDPlace]);
					}
				#endif

				WordID = crc32boitho(Data[i]);

				if (WordID == 0) {
					printf("got 0 as word id for \"%s\". Somthing may be wrong.\n",Data[i]);
				}

				bucket = WordID % NrOfDataDirectorys;

				// hits is stored as an unsigned short, so clamp the position
				if (nrOfLinkWordsToDocID[DocIDPlace] > 65535) {
					hits = 65535;
				}
				else {
					hits = nrOfLinkWordsToDocID[DocIDPlace];
				}

				#ifdef DEBUG
					if (DocID == 4999999) {
						printf("\thits %i: \"%s\": %hu, bucket %i\n",i,Data[i],hits,bucket);
					}
				#endif

				if (fwrite(&DocID,sizeof(unsigned int),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite DocID");
				}

				// runarb, 13 May 2007: language is now stored as a number.
				// It should come from DocumentIndex if present, but it is not
				// stored there; see IndexRes for how it is handled.
				lang = 0;
				nr = 1;
				if (fwrite(&lang,sizeof(unsigned char),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite lang");
				}

				if (fwrite(&WordID,sizeof(unsigned long),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite WordID");
				}

				if (fwrite(&nr,sizeof(unsigned long),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite nr");
				}

				if (fwrite(&hits,sizeof(unsigned short),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite hits");
				}

				++nrOfLinkWordsToDocID[DocIDPlace];
			}
		}
		FreeSplitList(Data);

		#ifdef DEBUG
			if (DocID == 4999999) {
				printf("\n");
			}
		#endif
	}

	free(nrOfLinkWordsToDocID);

	return 0;
}