Esempio n. 1
0
/*
 * Scans every entry of one repository lot and appends a (url, DocID) record
 * to the file named by FilePath for each entry that qualifies.
 *
 * Usage: rread <lotnr> <subname>
 *
 * An entry produces a record only when its response code is 200, its image
 * size is zero and its html size is non-zero. The count printed at the end
 * covers every entry returned by rGetNext, not only the ones written.
 *
 * Returns 0 on success. Exits with status 1 on a usage error, if the output
 * file cannot be opened, or if a record cannot be written.
 */
int main (int argc, char *argv[]) {

	int LotNr;
	char lotPath[255];

	struct ReposetoryHeaderFormat ReposetoryHeader;
	unsigned int radress;

	char htmlbuffer[524288];
	char imagebuffer[524288];
	char subname[maxSubnameLength];

	struct udfileFormat udfilepost;

	FILE *FH;

	if (argc < 3) {
		printf("Error ingen lotnr spesifisert.\n\nEksempel på bruk for å lese lot 2:\n\trread 2 www\n");
		exit(1);
	}

	LotNr = atoi(argv[1]);

	// subname is an uninitialised stack array and strncpy does not write a
	// terminator when the source is at least as long as the limit, so the
	// last byte has to be set explicitly.
	strncpy(subname,argv[2],sizeof(subname) -1);
	subname[sizeof(subname) -1] = '\0';

	printf("lotnr %i\n",LotNr);

	GetFilPathForLot(lotPath,LotNr,subname);
	printf("Opning lot at: %s for %s\n",lotPath,subname);

	// NOTE(review): FilePath is not declared in this function; it has to be
	// provided elsewhere in the file.
	if ((FH = fopen(FilePath,"ab")) == NULL) {
		perror("tmpe file");
		exit(1);
	}

	// Zero the record once so that bytes which are never assigned below
	// (padding, and any members besides url and DocID if the struct has
	// them) are not written to disk as uninitialised memory.
	memset(&udfilepost,0,sizeof(udfilepost));

	// loop through all entries in the lot
	int htmlbufferSize = sizeof(htmlbuffer);
	int count = 0;
	while (rGetNext(LotNr,&ReposetoryHeader,htmlbuffer,htmlbufferSize,imagebuffer,&radress,0,0,subname)) {

		if ((ReposetoryHeader.response == 200) && (ReposetoryHeader.imageSize == 0) && (ReposetoryHeader.htmlSize != 0)) {

			// NOTE(review): the url field is left without a terminator when
			// the source url fills it completely; confirm that readers of
			// this file treat it as a fixed-width field.
			strncpy(udfilepost.url,ReposetoryHeader.url,sizeof(udfilepost.url));
			udfilepost.DocID = ReposetoryHeader.DocID;

			if (fwrite(&udfilepost,sizeof(udfilepost),1,FH) != 1) {
				perror("tmpe file");
				fclose(FH);
				exit(1);
			}
		}
		++count;
	}
	printf("Did analyse %i pages\n",count);

	// fclose flushes buffered records, so a failure here means lost data
	if (fclose(FH) != 0) {
		perror("tmpe file");
		exit(1);
	}

	return 0;
}
Esempio n. 2
0
/*
 * Prints the url of every entry in a lot whose rank, as returned by
 * popRankForDocIDMemArray, is at least a given minimum.
 *
 * Usage: rread <lotnr> <subname> <servername> <minrank>
 *
 * Returns 0 after the whole lot has been read; exits with status 1 when
 * fewer than four arguments are given.
 */
int main (int argc, char *argv[]) {

	int LotNr;
	char lotPath[255];
	int rank;

	struct ReposetoryHeaderFormat ReposetoryHeader;
	unsigned int radress;

	char htmlbuffer[524288];
	char imagebuffer[524288];
	char *acl;

	if (argc < 5) {
		printf("Error ingen lotnr spesifisert.\n\nEksempel på bruk for å lese lot 2:\n\trread 2 www servername minrank\n");
		exit(1);
	}

	LotNr = atoi(argv[1]);
	char *subname = argv[2];
	char *servername = argv[3];
	int minrank = atoi(argv[4]);

	printf("lotnr %i\n",LotNr);

	GetFilPathForLot(lotPath,LotNr,subname);
	printf("Opning lot at: %s for %s\n",lotPath,subname);

	popopenMemArray(servername,"www");

	// loop through all entries in the lot
	// NOTE(review): this call passes ten arguments, the last being &acl.
	// Confirm against the current declaration of rGetNext.
	while (rGetNext(LotNr,&ReposetoryHeader,htmlbuffer,sizeof(htmlbuffer),imagebuffer,&radress,0,0,subname,&acl)) {

		rank = popRankForDocIDMemArray(ReposetoryHeader.DocID);

		if (rank >= minrank) {
			printf("%s\n",ReposetoryHeader.url);
		}
	}

	return 0;
}
Esempio n. 3
0
main (int argc, char *argv[]) {


        int LotNr;
        char lotPath[255];
	char FileName[255];

        struct ReposetoryHeaderFormat ReposetoryHeader;
        unsigned long int radress;

        char htmlbuffer[524288];
	Bytef  *htmlncompressBuffer;
	int htmlncompressBufferSize;

        char imagebuffer[524288];
	int errornr;


	FILE *IMAGEFILE;
	FILE *HTMLFILE;

        if (argc < 3) {
                printf("Error ingen lotnr spesifisert.\n\nEksempel på bruk for å dumpe lot 2:\n\trread 2 /tmp/boitho/\n");
                exit(1);
        }

        LotNr = atoi(argv[1]);

        GetFilPathForLot(&lotPath,LotNr);
        printf("Opning lot at: %s\n",lotPath);


	//henter minne
	htmlncompressBuffer = (Bytef *)malloc(2097152);

	
        //loppergjenom alle
        while (rGetNext(LotNr,&ReposetoryHeader,htmlbuffer,imagebuffer,&radress, 0, 0)) {

	printf("imageSize %i, htmlSize %i\n",ReposetoryHeader.imageSize,ReposetoryHeader.htmlSize);

	if (ReposetoryHeader.response == 200) {

		
		//skriver bilde
		sprintf(FileName,"%s%i.jpg",argv[2],ReposetoryHeader.DocID);

		if ((IMAGEFILE = fopen(FileName,"wb")) == NULL) {
                	printf("Open file image file");
                	perror(FileName);
                	exit(1);
        	}

		
		fwrite(imagebuffer,ReposetoryHeader.imageSize,1,IMAGEFILE);

		fclose(IMAGEFILE);
		

		//skriver html fil
		sprintf(FileName,"%s%i.html",argv[2],ReposetoryHeader.DocID);

                if ((HTMLFILE = fopen(FileName,"wb")) == NULL) {
                        printf("Open file image file");
                        perror(FileName);
                        exit(1);
                }

		//dekomprimerer htmlen		
		//burde ikke hardkode her
		htmlncompressBufferSize = 2097152;
		//printf("htmlncompressBufferSize: %i, htmlSize: %i\n",htmlncompressBufferSize, ReposetoryHeader.htmlSize);		
		if ((errornr = uncompress(htmlncompressBuffer,(uLongf *)&htmlncompressBufferSize,htmlbuffer,(uLong)ReposetoryHeader.htmlSize)) != 0) {
			printf("Cant uncompress: %i\n",errornr);
		}
		else {

                fwrite(htmlncompressBuffer,htmlncompressBufferSize,1,HTMLFILE);

                fclose(HTMLFILE);
		}

                printf("DocId: %i url: %s\n",ReposetoryHeader.DocID,ReposetoryHeader.url);

                //sletter litt slik at vi ser om det blir noen feil i lesingen
                ReposetoryHeader.DocID = -1;

        }

	}

	free(htmlncompressBuffer);
}
Esempio n. 4
0
/*
 * Indexes every page stored in one lot.
 *
 * Usage: <program> <lotnr>
 *
 * The lot's server is looked up with lotlistGetServer, then each entry
 * returned by rGetNext is passed to handelPage. Entries whose DocID does not
 * map back to this lot are reported and skipped, and entries whose url does
 * not contain ".no/" are skipped silently. A per-code tally of the http
 * responses and the number of indexed pages are printed at the end.
 *
 * Exits with status 0 when no lot number is given.
 */
int main (int argc, char *argv[]) {

	// The network path is switched off; only the local branch runs.
	const int useNetwork = 0;

	int lotNr;
	int pageCount;
	int code;
	char lotServer[64];

	unsigned int FiltetTime;
	unsigned int FileOffset;
	unsigned int lastIndexTime;
	unsigned long int radress;

	char htmlcompressdbuffer[524288];  //0.5 mb
	char imagebuffer[524288];  //0.5 mb

	int httpResponsCodes[nrOfHttpResponsCodes];
	FILE *revindexFilesHa[NrOfDataDirectorys];

	struct ReposetoryHeaderFormat ReposetoryHeader;
	struct DocumentIndexFormat DocumentIndexPost;
	struct adultFormat adult;

	if (argc < 2) {
		printf("Dette programet indekserer en lot. Gi det et lot nummer\n");
		exit(0);
	}

	// start every response counter at zero
	code = 0;
	while (code < nrOfHttpResponsCodes) {
		httpResponsCodes[code] = 0;
		code++;
	}

	lotNr = atoi(argv[1]);

	// find the server based on the lot number
	lotlistLoad();
	lotlistGetServer(lotServer,lotNr);

	printf("vil index lot nr %i at %s\n",lotNr,lotServer);

	adultLoad(&adult);

	// temp: these should be fetched from the slot server or a file
	FiltetTime = 0;
	FileOffset = 0;

	pageCount = 0;

	if (useNetwork) {

		printf("will ges pages by net\n");

		revindexFilesOpenNET(revindexFilesHa);

		while (rGetNextNET(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,&radress,FiltetTime,FileOffset)) {

			global_curentDocID = ReposetoryHeader.DocID;
			// a '?' in the url marks it as dynamic
			global_curentUrlIsDynamic = (strchr(ReposetoryHeader.url,'?') != 0);

			handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult);

			// the repository data is copied into the document index
			// post regardless of what handelPage did
			copyRepToDi(&DocumentIndexPost,&ReposetoryHeader);
			DocumentIndexPost.RepositoryPointer = radress;

			// write to the DocumentIndex
			DIWriteNET(lotServer,&DocumentIndexPost,ReposetoryHeader.DocID);

			++pageCount;
		}

		printf("Sending pages\n");

		revindexFilesSendNET(revindexFilesHa,lotNr);

	}
	else {
		printf("Wil acess files localy\n");

		// find the last indexing time (the value is currently not used)
		lastIndexTime =  GetLastIndexTimeForLot(lotNr);

		revindexFilesOpenLocal(revindexFilesHa,lotNr);

		while (rGetNext(lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,&radress,FiltetTime,FileOffset)) {

			// the page may be corrupt: check that the DocID maps to the
			// same lot as the one being read
			if (rLotForDOCid(ReposetoryHeader.DocID) != lotNr) {
				printf("bad DocID %i\n",ReposetoryHeader.DocID);
				continue;
			}

			// only .no pages are indexed
			if (strstr(ReposetoryHeader.url,".no/") == 0) {
				continue;
			}

			global_curentDocID = ReposetoryHeader.DocID;
			// a '?' in the url marks it as dynamic
			global_curentUrlIsDynamic = (strchr(ReposetoryHeader.url,'?') != 0);

			handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult);

			// the repository data is copied into the document index
			// post regardless of what handelPage did
			copyRepToDi(&DocumentIndexPost,&ReposetoryHeader);
			DocumentIndexPost.RepositoryPointer = radress;

			// writing to the DocumentIndex is disabled for now:
			// DIWrite(&DocumentIndexPost,ReposetoryHeader.DocID);

			++pageCount;
		}

		// writing the index time back to the lot is disabled for now:
		// setLastIndexTimeForLot(lotNr);

		// the revindex files need no copying since the local ones are
		// worked on directly
	}

	// print an overview of the http responses that were encountered
	printf("http responses:\n");
	code = 0;
	while (code < nrOfHttpResponsCodes) {
		if (httpResponsCodes[code] != 0) {
			printf("%i: %i\n",code,httpResponsCodes[code]);
		}
		code++;
	}

	printf("indexed %i pages\n",pageCount);

}
Esempio n. 5
0
/*
 * Indexes every page stored in one lot of a given subname and writes a
 * DocumentIndex entry for each of them.
 *
 * Usage: IndexerLot lotNr subname
 *
 * The local branch refuses to run when lotHasSufficientSpace reports too
 * little space, or when GetLastIndexTimeForLot returns a non-zero time.
 * Otherwise each entry returned by rGetNext is passed to handelPage, copied
 * into a DocumentIndex post and written with DIWrite. Finally
 * setLastIndexTimeForLot is called for the lot.
 *
 * Returns 0 on success. Exits with status 0 on a usage error and with
 * status 1 when one of the two checks above fails.
 */
int main (int argc, char *argv[]) {

	int lotNr;
	char lotServer[64];
	int pageCount;
	int i;

	unsigned int FiltetTime;
	unsigned int FileOffset;

	char htmlcompressdbuffer[524288];  //0.5 mb
	char imagebuffer[524288];  //0.5 mb

	int httpResponsCodes[nrOfHttpResponsCodes];

	struct ReposetoryHeaderFormat ReposetoryHeader;
	struct DocumentIndexFormat DocumentIndexPost;
	unsigned long int radress;
	FILE *revindexFilesHa[NrOfDataDirectorys];
	struct adultFormat adult;
	unsigned int lastIndexTime;

	if (argc < 3) {
		printf("Dette programet indekserer en lot. Usage:\n\tIndexerLot lotNr subname\n");
		exit(0);
	}

	for(i=0;i<nrOfHttpResponsCodes;i++) {
		httpResponsCodes[i] = 0;
	}

	lotNr = atoi(argv[1]);

	// NOTE(review): subname is not declared in this function; it has to be
	// provided elsewhere in the file. strncpy writes no terminator when the
	// argument is at least as long as the limit, so set the last byte
	// explicitly instead of relying on how that storage was initialised.
	strncpy(subname,argv[2],sizeof(subname) -1);
	subname[sizeof(subname) -1] = '\0';

	// find the server based on the lot number
	lotlistLoad();
	lotlistGetServer(lotServer,lotNr);

	printf("vil index lot nr %i at %s\n",lotNr,lotServer);

	adultLoad(&adult);

	langdetectInit();

	// temp: these should be fetched from the slot server or a file
	FiltetTime = 0;
	FileOffset = 0;

	pageCount = 0;

	// The network path is switched off; only the local branch runs.
	if (0) {

		printf("will ges pages by net\n");

		revindexFilesOpenNET(revindexFilesHa);

		while (rGetNextNET(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,&radress,FiltetTime,FileOffset,subname)) {

			global_curentDocID = ReposetoryHeader.DocID;
			// a '?' in the url marks it as dynamic
			if (strchr(ReposetoryHeader.url,'?') == 0) {
				global_curentUrlIsDynamic = 0;
			}
			else {
				global_curentUrlIsDynamic = 1;
			}

			handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult);

			// the repository data is copied into the document index
			// post regardless of what handelPage did
			copyRepToDi(&DocumentIndexPost,&ReposetoryHeader);

			DocumentIndexPost.RepositoryPointer = radress;

			// write to the DocumentIndex
			DIWriteNET(lotServer,&DocumentIndexPost,ReposetoryHeader.DocID,subname);

			++pageCount;

		}

		printf("Sending pages\n");

		revindexFilesSendNET(revindexFilesHa,lotNr);

	}
	else {
		printf("Wil acess files localy\n");

		// check that there is enough space
		if (!lotHasSufficientSpace(lotNr,4096,subname)) {
			printf("insufficient disk space\n");
			exit(1);
		}

		// find the last indexing time
		lastIndexTime =  GetLastIndexTimeForLot(lotNr,subname);

		if(lastIndexTime != 0) {
			// lastIndexTime is unsigned, so it is printed with %u
			printf("lastIndexTime is not 0, but %u\n",lastIndexTime);
			exit(1);
		}

		revindexFilesOpenLocal(revindexFilesHa,lotNr,"Main","wb",subname);

		while (rGetNext(lotNr,&ReposetoryHeader,htmlcompressdbuffer,sizeof(htmlcompressdbuffer),imagebuffer,&radress,FiltetTime,FileOffset,subname)) {

			global_curentDocID = ReposetoryHeader.DocID;
			// a '?' in the url marks it as dynamic
			if (strchr(ReposetoryHeader.url,'?') == 0) {
				global_curentUrlIsDynamic = 0;
			}
			else {
				global_curentUrlIsDynamic = 1;
			}

			printf("%s\n",ReposetoryHeader.url);

			handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult);

			// the repository data is copied into the document index
			// post regardless of what handelPage did
			copyRepToDi(&DocumentIndexPost,&ReposetoryHeader);

			DocumentIndexPost.RepositoryPointer = radress;

			// write to the DocumentIndex
			DIWrite(&DocumentIndexPost,ReposetoryHeader.DocID,subname);

			++pageCount;

		}

		// write the index time back to the lot
		setLastIndexTimeForLot(lotNr,httpResponsCodes,subname);

		// the revindex files need no copying since the local ones are
		// worked on directly
	}

	langdetectDestroy();

	printf("indexed %i pages\n\n\n",pageCount);

	return 0;
}
Esempio n. 6
0
/*
 * Rewrites the RepositoryPointer of the DocumentIndex entries of one lot.
 *
 * Usage: <program> <lotnr>
 *
 * Starting at a fixed file offset, every entry returned by rGetNext has its
 * DocumentIndex post read with DIRead, its RepositoryPointer set to the
 * address reported by rGetNext, and the post written back with DIWrite.
 *
 * Returns 0 when the lot has been processed. Exits with status 0 when no lot
 * number is given and with status 1 when GetLastIndexTimeForLot returns 0.
 */
int main (int argc, char *argv[]) {

	int lotNr;
	int pageCount;
	int code;
	char lotServer[64];

	unsigned int FiltetTime;
	unsigned int FileOffset;
	unsigned int lastIndexTime;
	unsigned long int radress;

	char htmlcompressdbuffer[524288];  //0.5 mb
	char imagebuffer[524288];  //0.5 mb

	int httpResponsCodes[nrOfHttpResponsCodes];
	FILE *revindexFilesHa[NrOfDataDirectorys];

	struct ReposetoryHeaderFormat ReposetoryHeader;
	struct DocumentIndexFormat DocumentIndexPost;
	struct adultFormat adult;

	if (argc < 2) {
		printf("Dette programet indekserer en lot. Gi det et lot nummer\n");
		exit(0);
	}

	// start every response counter at zero
	code = 0;
	while (code < nrOfHttpResponsCodes) {
		httpResponsCodes[code] = 0;
		code++;
	}

	lotNr = atoi(argv[1]);

	// find the server based on the lot number
	lotlistLoad();
	lotlistGetServer(lotServer,lotNr);

	printf("vil index lot nr %i at %s\n",lotNr,lotServer);

	// find the last indexing time; a lot that reports 0 is skipped
	// NOTE(review): subname is neither declared nor assigned in this
	// function, so it keeps whatever value it has elsewhere in the file.
	// Confirm that is intended.
	lastIndexTime =  GetLastIndexTimeForLot(lotNr,subname);
	if (lastIndexTime == 0) {
		printf("lastIndexTime is 0, skiping.\n");
		exit(1);
	}

	// temp: these should be fetched from the slot server or a file
	FiltetTime = 0;
	FileOffset = 2140483648;

	pageCount = 0;

	printf("Wil acess files localy\n");

	while (rGetNext(lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,&radress,FiltetTime,FileOffset,subname)) {

		// fetch the existing post, point it at the repository address
		// of this entry and write it back to the DocumentIndex
		DIRead(&DocumentIndexPost,ReposetoryHeader.DocID,subname);
		DocumentIndexPost.RepositoryPointer = radress;
		DIWrite(&DocumentIndexPost,ReposetoryHeader.DocID,subname);

		++pageCount;
	}

	printf("indexed %i pages\n\n\n",pageCount);

	return 0;
}