main (int argc, char *argv[]) { int LotNr; char lotPath[255]; struct ReposetoryHeaderFormat ReposetoryHeader; unsigned int radress; char htmlbuffer[524288]; char imagebuffer[524288]; char subname[maxSubnameLength]; struct udfileFormat udfilepost; FILE *FH; if (argc < 3) { printf("Error ingen lotnr spesifisert.\n\nEksempel på bruk for å lese lot 2:\n\trread 2 www\n"); exit(1); } LotNr = atoi(argv[1]); strncpy(subname,argv[2],sizeof(subname) -1); printf("lotnr %i\n",LotNr); GetFilPathForLot(lotPath,LotNr,subname); printf("Opning lot at: %s for %s\n",lotPath,subname); if ((FH = fopen(FilePath,"ab")) == NULL) { perror("tmpe file"); exit(1); } //loppergjenom alle int htmlbufferSize = sizeof(htmlbuffer); int count = 0; while (rGetNext(LotNr,&ReposetoryHeader,htmlbuffer,htmlbufferSize,imagebuffer,&radress,0,0,subname)) { if ((ReposetoryHeader.response == 200) && (ReposetoryHeader.imageSize == 0) && (ReposetoryHeader.htmlSize != 0)) { //printf("DocId: %i url: %s res %hi htmls %hi time %lu\n",ReposetoryHeader.DocID,ReposetoryHeader.url,ReposetoryHeader.response,ReposetoryHeader.htmlSize,ReposetoryHeader.time); //printf("################################\n%s##############################\n",htmlbuffer); strncpy(udfilepost.url,ReposetoryHeader.url,sizeof(udfilepost.url)); udfilepost.DocID = ReposetoryHeader.DocID; fwrite(&udfilepost,sizeof(udfilepost),1,FH); } ++count; } printf("Did analyse %i pages\n",count); fclose(FH); }
/*
 * Prints the URL of every record in a lot whose rank, as returned by
 * popRankForDocIDMemArray(), is at least minrank.
 *
 * Usage: <program> lotNr subname servername minrank
 *
 * Returns 0 after the whole lot has been read; exits with status 1 when too
 * few arguments are given.
 */
int main (int argc, char *argv[]) {

	int LotNr;
	char lotPath[255];
	int rank;
	struct ReposetoryHeaderFormat ReposetoryHeader;
	unsigned int radress;
	char htmlbuffer[524288];
	char imagebuffer[524288];
	char *acl;	/* filled in by rGetNext; not used by this tool */

	if (argc < 5) {
		printf("Error ingen lotnr spesifisert.\n\nEksempel på bruk for å lese lot 2:\n\trread 2 www servername minrank\n");
		exit(1);
	}

	LotNr = atoi(argv[1]);
	char *subname = argv[2];
	char *servername = argv[3];
	int minrank = atoi(argv[4]);

	printf("lotnr %i\n",LotNr);

	GetFilPathForLot(lotPath,LotNr,subname);
	printf("Opning lot at: %s for %s\n",lotPath,subname);

	/* NOTE(review): the second argument is the literal "www", not subname
	   - confirm that this is intended. */
	popopenMemArray(servername,"www");

	/* loop through every record in the lot */
	while (rGetNext(LotNr,&ReposetoryHeader,htmlbuffer,sizeof(htmlbuffer),imagebuffer,&radress,0,0,subname,&acl)) {

		rank = popRankForDocIDMemArray(ReposetoryHeader.DocID);

		if (rank >= minrank) {
			printf("%s\n",ReposetoryHeader.url);
		}
	}

	/* explicit status: falling off the end of main leaves it unspecified
	   for pre-C99 compilers */
	return 0;
}
main (int argc, char *argv[]) { int LotNr; char lotPath[255]; char FileName[255]; struct ReposetoryHeaderFormat ReposetoryHeader; unsigned long int radress; char htmlbuffer[524288]; Bytef *htmlncompressBuffer; int htmlncompressBufferSize; char imagebuffer[524288]; int errornr; FILE *IMAGEFILE; FILE *HTMLFILE; if (argc < 3) { printf("Error ingen lotnr spesifisert.\n\nEksempel på bruk for å dumpe lot 2:\n\trread 2 /tmp/boitho/\n"); exit(1); } LotNr = atoi(argv[1]); GetFilPathForLot(&lotPath,LotNr); printf("Opning lot at: %s\n",lotPath); //henter minne htmlncompressBuffer = (Bytef *)malloc(2097152); //loppergjenom alle while (rGetNext(LotNr,&ReposetoryHeader,htmlbuffer,imagebuffer,&radress, 0, 0)) { printf("imageSize %i, htmlSize %i\n",ReposetoryHeader.imageSize,ReposetoryHeader.htmlSize); if (ReposetoryHeader.response == 200) { //skriver bilde sprintf(FileName,"%s%i.jpg",argv[2],ReposetoryHeader.DocID); if ((IMAGEFILE = fopen(FileName,"wb")) == NULL) { printf("Open file image file"); perror(FileName); exit(1); } fwrite(imagebuffer,ReposetoryHeader.imageSize,1,IMAGEFILE); fclose(IMAGEFILE); //skriver html fil sprintf(FileName,"%s%i.html",argv[2],ReposetoryHeader.DocID); if ((HTMLFILE = fopen(FileName,"wb")) == NULL) { printf("Open file image file"); perror(FileName); exit(1); } //dekomprimerer htmlen //burde ikke hardkode her htmlncompressBufferSize = 2097152; //printf("htmlncompressBufferSize: %i, htmlSize: %i\n",htmlncompressBufferSize, ReposetoryHeader.htmlSize); if ((errornr = uncompress(htmlncompressBuffer,(uLongf *)&htmlncompressBufferSize,htmlbuffer,(uLong)ReposetoryHeader.htmlSize)) != 0) { printf("Cant uncompress: %i\n",errornr); } else { fwrite(htmlncompressBuffer,htmlncompressBufferSize,1,HTMLFILE); fclose(HTMLFILE); } printf("DocId: %i url: %s\n",ReposetoryHeader.DocID,ReposetoryHeader.url); //sletter litt slik at vi ser om det blir noen feil i lesingen ReposetoryHeader.DocID = -1; } } free(htmlncompressBuffer); }
int main (int argc, char *argv[]) { int lotNr; char lotServer[64]; int pageCount; int i; unsigned int FiltetTime; unsigned int FileOffset; char htmlcompressdbuffer[524288]; //0.5 mb char imagebuffer[524288]; //0.5 mb int httpResponsCodes[nrOfHttpResponsCodes]; struct ReposetoryHeaderFormat ReposetoryHeader; struct DocumentIndexFormat DocumentIndexPost; unsigned long int radress; FILE *revindexFilesHa[NrOfDataDirectorys]; struct adultFormat adult; unsigned int lastIndexTime; if (argc < 2) { printf("Dette programet indekserer en lot. Gi det et lot nummer\n"); exit(0); } for(i=0;i<nrOfHttpResponsCodes;i++) { httpResponsCodes[i] = 0; } lotNr = atoi(argv[1]); //find server based on lotnr lotlistLoad(); lotlistGetServer(lotServer,lotNr); printf("vil index lot nr %i at %s\n",lotNr,lotServer); adultLoad(&adult); //temp: må hente dette fra slot server eller fil FiltetTime = 0; FileOffset = 0; pageCount = 0; if (0) { printf("will ges pages by net\n"); revindexFilesOpenNET(revindexFilesHa); while (rGetNextNET(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,&radress,FiltetTime,FileOffset)) { global_curentDocID = ReposetoryHeader.DocID; if (strchr(ReposetoryHeader.url,'?') == 0) { global_curentUrlIsDynamic = 0; } else { global_curentUrlIsDynamic = 1; } handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult); //datta skal uansett kopieres over //kopierer over di data copyRepToDi(&DocumentIndexPost,&ReposetoryHeader); DocumentIndexPost.RepositoryPointer = radress; //skiver til DocumentIndex DIWriteNET(lotServer,&DocumentIndexPost,ReposetoryHeader.DocID); ++pageCount; //temp: //if(pageCount > 1000) { // break; //} } printf("Sending pages\n"); revindexFilesSendNET(revindexFilesHa,lotNr); } else { printf("Wil acess files localy\n"); //finner siste indekseringstid lastIndexTime = GetLastIndexTimeForLot(lotNr); //temp: 
/***********************************************************/ //if(lastIndexTime != 0) { // printf("lastIndexTime is not 0, but %i\n",lastIndexTime); // exit(1); //} //FiltetTime = lastIndexTime; //if(lastIndexTime == 0) { // printf("lastIndexTime is not 0, but %i\n",lastIndexTime); // exit(1); //} /***********************************************************/ revindexFilesOpenLocal(revindexFilesHa,lotNr); while (rGetNext(lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,&radress,FiltetTime,FileOffset)) { //printf("D: %lu, R: %lu\n",ReposetoryHeader.DocID, radress); //kan være siden er korupt, sjekker at docID gir samme lot som den vi leser if (rLotForDOCid(ReposetoryHeader.DocID) != lotNr) { printf("bad DocID %i\n",ReposetoryHeader.DocID); } //indekserer bare .no sider else if (strstr(ReposetoryHeader.url,".no/") == 0){ //ikke no } else { global_curentDocID = ReposetoryHeader.DocID; if (strchr(ReposetoryHeader.url,'?') == 0) { global_curentUrlIsDynamic = 0; } else { global_curentUrlIsDynamic = 1; } handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult); //printf("%s %i\n",ReposetoryHeader.url,DocumentIndexPost.AdultWeight); //datta skal uansett kopieres over //kopierer over di data copyRepToDi(&DocumentIndexPost,&ReposetoryHeader); DocumentIndexPost.RepositoryPointer = radress; //skiver til DocumentIndex //skriver ikke for nå: DIWrite(&DocumentIndexPost,ReposetoryHeader.DocID); ++pageCount; } //temp: //if(pageCount > 10) { // break; //} } //skriver riktig indexstide til lotten //temp: setLastIndexTimeForLot(lotNr); // vi må ikke kopiere revindex filene da vi jobber på de lokale direkte } //skriver ut en oversikt over hvilkene http responser vi kom over printf("http responses:\n"); for(i=0;i<nrOfHttpResponsCodes;i++) { if (httpResponsCodes[i] != 0) { printf("%i: %i\n",i,httpResponsCodes[i]); } } printf("indexed %i pages\n",pageCount); }
int main (int argc, char *argv[]) { int lotNr; char lotServer[64]; int pageCount; int i; unsigned int FiltetTime; unsigned int FileOffset; char htmlcompressdbuffer[524288]; //0.5 mb char imagebuffer[524288]; //0.5 mb int httpResponsCodes[nrOfHttpResponsCodes]; struct ReposetoryHeaderFormat ReposetoryHeader; struct DocumentIndexFormat DocumentIndexPost; unsigned long int radress; FILE *revindexFilesHa[NrOfDataDirectorys]; struct adultFormat adult; unsigned int lastIndexTime; if (argc < 3) { printf("Dette programet indekserer en lot. Usage:\n\tIndexerLot lotNr subname\n"); exit(0); } for(i=0;i<nrOfHttpResponsCodes;i++) { httpResponsCodes[i] = 0; } lotNr = atoi(argv[1]); strncpy(subname,argv[2],sizeof(subname) -1); //find server based on lotnr lotlistLoad(); lotlistGetServer(lotServer,lotNr); printf("vil index lot nr %i at %s\n",lotNr,lotServer); adultLoad(&adult); langdetectInit(); //temp: må hente dette fra slot server eller fil FiltetTime = 0; FileOffset = 0; pageCount = 0; if (0) { printf("will ges pages by net\n"); revindexFilesOpenNET(revindexFilesHa); while (rGetNextNET(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,&radress,FiltetTime,FileOffset,subname)) { global_curentDocID = ReposetoryHeader.DocID; if (strchr(ReposetoryHeader.url,'?') == 0) { global_curentUrlIsDynamic = 0; } else { global_curentUrlIsDynamic = 1; } handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult); //datta skal uansett kopieres over //kopierer over di data copyRepToDi(&DocumentIndexPost,&ReposetoryHeader); DocumentIndexPost.RepositoryPointer = radress; //skiver til DocumentIndex DIWriteNET(lotServer,&DocumentIndexPost,ReposetoryHeader.DocID,subname); ++pageCount; //temp: //if(pageCount > 999) { // printf("Exeting after only %i docs\n",pageCount); // break; //} } printf("Sending pages\n"); revindexFilesSendNET(revindexFilesHa,lotNr); } else { printf("Wil acess files 
localy\n"); //sjekker om vi har nokk palss if (!lotHasSufficientSpace(lotNr,4096,subname)) { printf("insufficient disk space\n"); exit(1); } //finner siste indekseringstid lastIndexTime = GetLastIndexTimeForLot(lotNr,subname); if(lastIndexTime != 0) { printf("lastIndexTime is not 0, but %i\n",lastIndexTime); exit(1); } revindexFilesOpenLocal(revindexFilesHa,lotNr,"Main","wb",subname); //temp:Søker til problemområdet //FileOffset = 334603785; while (rGetNext(lotNr,&ReposetoryHeader,htmlcompressdbuffer,sizeof(htmlcompressdbuffer),imagebuffer,&radress,FiltetTime,FileOffset,subname)) { //printf("D: %u, R: %lu\n",ReposetoryHeader.DocID, radress); global_curentDocID = ReposetoryHeader.DocID; if (strchr(ReposetoryHeader.url,'?') == 0) { global_curentUrlIsDynamic = 0; } else { global_curentUrlIsDynamic = 1; } printf("%s\n",ReposetoryHeader.url); handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult); //datta skal kopieres over uanset hva som skjer //kopierer over di data copyRepToDi(&DocumentIndexPost,&ReposetoryHeader); DocumentIndexPost.RepositoryPointer = radress; //skiver til DocumentIndex DIWrite(&DocumentIndexPost,ReposetoryHeader.DocID,subname); ++pageCount; //if(pageCount > 9999) { // printf("Exeting after only %i docs\n",pageCount); // //break; // exit(1); //} } //skriver riktig indexstide til lotten setLastIndexTimeForLot(lotNr,httpResponsCodes,subname); // vi må ikke kopiere revindex filene da vi jobber på de lokale direkte } langdetectDestroy(); printf("indexed %i pages\n\n\n",pageCount); return 0; }
int main (int argc, char *argv[]) { int lotNr; char lotServer[64]; int pageCount; int i; unsigned int FiltetTime; unsigned int FileOffset; char htmlcompressdbuffer[524288]; //0.5 mb char imagebuffer[524288]; //0.5 mb int httpResponsCodes[nrOfHttpResponsCodes]; struct ReposetoryHeaderFormat ReposetoryHeader; struct DocumentIndexFormat DocumentIndexPost; unsigned long int radress; FILE *revindexFilesHa[NrOfDataDirectorys]; struct adultFormat adult; unsigned int lastIndexTime; if (argc < 2) { printf("Dette programet indekserer en lot. Gi det et lot nummer\n"); exit(0); } for(i=0;i<nrOfHttpResponsCodes;i++) { httpResponsCodes[i] = 0; } lotNr = atoi(argv[1]); //find server based on lotnr lotlistLoad(); lotlistGetServer(lotServer,lotNr); printf("vil index lot nr %i at %s\n",lotNr,lotServer); //finner siste indekseringstid lastIndexTime = GetLastIndexTimeForLot(lotNr,subname); if(lastIndexTime == 0) { printf("lastIndexTime is 0, skiping.\n"); exit(1); } //temp: må hente dette fra slot server eller fil FiltetTime = 0; FileOffset = 2140483648; //FileOffset = 1997015914; pageCount = 0; printf("Wil acess files localy\n"); while (rGetNext(lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,&radress,FiltetTime,FileOffset,subname)) { DIRead(&DocumentIndexPost,ReposetoryHeader.DocID,subname); DocumentIndexPost.RepositoryPointer = radress; //skiver til DocumentIndex DIWrite(&DocumentIndexPost,ReposetoryHeader.DocID,subname); ++pageCount; } printf("indexed %i pages\n\n\n",pageCount); return 0; }