int main (int argc, char *argv[]) { int lotNr; char lotServer[64]; int pageCount; int i; unsigned int FiltetTime; unsigned int FileOffset; char htmlcompressdbuffer[524288]; //0.5 mb char imagebuffer[524288]; //0.5 mb int httpResponsCodes[nrOfHttpResponsCodes]; struct ReposetoryHeaderFormat ReposetoryHeader; struct DocumentIndexFormat DocumentIndexPost; unsigned long int radress; FILE *revindexFilesHa[NrOfDataDirectorys]; struct adultFormat adult; unsigned int lastIndexTime; if (argc < 2) { printf("Dette programet indekserer en lot. Gi det et lot nummer\n"); exit(0); } for(i=0;i<nrOfHttpResponsCodes;i++) { httpResponsCodes[i] = 0; } lotNr = atoi(argv[1]); //find server based on lotnr lotlistLoad(); lotlistGetServer(lotServer,lotNr); printf("vil index lot nr %i at %s\n",lotNr,lotServer); adultLoad(&adult); //temp: må hente dette fra slot server eller fil FiltetTime = 0; FileOffset = 0; pageCount = 0; if (0) { printf("will ges pages by net\n"); revindexFilesOpenNET(revindexFilesHa); while (rGetNextNET(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,&radress,FiltetTime,FileOffset)) { global_curentDocID = ReposetoryHeader.DocID; if (strchr(ReposetoryHeader.url,'?') == 0) { global_curentUrlIsDynamic = 0; } else { global_curentUrlIsDynamic = 1; } handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult); //datta skal uansett kopieres over //kopierer over di data copyRepToDi(&DocumentIndexPost,&ReposetoryHeader); DocumentIndexPost.RepositoryPointer = radress; //skiver til DocumentIndex DIWriteNET(lotServer,&DocumentIndexPost,ReposetoryHeader.DocID); ++pageCount; //temp: //if(pageCount > 1000) { // break; //} } printf("Sending pages\n"); revindexFilesSendNET(revindexFilesHa,lotNr); } else { printf("Wil acess files localy\n"); //finner siste indekseringstid lastIndexTime = GetLastIndexTimeForLot(lotNr); //temp: /***********************************************************/ //if(lastIndexTime != 0) { // printf("lastIndexTime is not 0, but %i\n",lastIndexTime); // exit(1); //} //FiltetTime = lastIndexTime; //if(lastIndexTime == 0) { // printf("lastIndexTime is not 0, but %i\n",lastIndexTime); // exit(1); //} /***********************************************************/ revindexFilesOpenLocal(revindexFilesHa,lotNr); while (rGetNext(lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,&radress,FiltetTime,FileOffset)) { //printf("D: %lu, R: %lu\n",ReposetoryHeader.DocID, radress); //kan være siden er korupt, sjekker at docID gir samme lot som den vi leser if (rLotForDOCid(ReposetoryHeader.DocID) != lotNr) { printf("bad DocID %i\n",ReposetoryHeader.DocID); } //indekserer bare .no sider else if (strstr(ReposetoryHeader.url,".no/") == 0){ //ikke no } else { global_curentDocID = ReposetoryHeader.DocID; if (strchr(ReposetoryHeader.url,'?') == 0) { global_curentUrlIsDynamic = 0; } else { global_curentUrlIsDynamic = 1; } handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult); //printf("%s %i\n",ReposetoryHeader.url,DocumentIndexPost.AdultWeight); //datta skal uansett kopieres over //kopierer over di data copyRepToDi(&DocumentIndexPost,&ReposetoryHeader); DocumentIndexPost.RepositoryPointer = radress; //skiver til DocumentIndex //skriver ikke for nå: DIWrite(&DocumentIndexPost,ReposetoryHeader.DocID); ++pageCount; } //temp: //if(pageCount > 10) { // break; //} } //skriver riktig indexstide til lotten //temp: setLastIndexTimeForLot(lotNr); // vi må ikke kopiere revindex filene da vi jobber på de lokale direkte } //skriver ut en oversikt over hvilkene http responser vi kom over printf("http responses:\n"); for(i=0;i<nrOfHttpResponsCodes;i++) { if (httpResponsCodes[i] != 0) { printf("%i: %i\n",i,httpResponsCodes[i]); } } printf("indexed %i pages\n",pageCount); }
int main (int argc, char *argv[]) { struct DocumentIndexFormat DocumentIndexPost; int PopRankextern; int PopRankintern; int PopRanknoc; int PopRanindex; char ShortRank; FILE *FH; struct popl popextern; struct popl popintern; struct popl popnoc; struct popl popindex; uLong htmlBufferSize = 0; char *htmlBuffer = NULL; char *acl_allowbuffer = NULL; char *acl_deniedbuffer = NULL; char timebuf[26]; int optShowhtml = 0; int optShowWords = 0; int optSummary = 0; int optAnchor = 0; int optResource = 0; int optPopRank = 0; int optDelete = 0; int optAdult = 0; unsigned int DocID; char *subname; if (getenv("QUERY_STRING") == NULL) { extern char *optarg; extern int optind, opterr, optopt; char c; while ((c=getopt(argc,argv,"hwsarpdu"))!=-1) { switch (c) { case 'h': optShowhtml = 1; break; case 'u': optAdult = 1; break; case 'w': optShowWords = 1; break; case 's': optSummary = 1; break; case 'a': optAnchor = 1; break; case 'p': optPopRank = 1; break; case 'r': optResource = 1; break; case 'd': optDelete = 1; break; default: exit(1); } } --optind; #ifdef DEBUG printf("argc %i, optind %i\n",argc,optind); #endif if ((argc - optind)!= 3) { printf("Dette programet gir info om en DocID\n\n\tUsage PageInfo DocID collection\n"); exit(1); } DocID = atol(argv[1 +optind]); subname = argv[2 +optind]; } else { printf("Content-type: text/plain\n\n"); int res; // Initialize the CGI lib res = cgi_init(); // Was there an error initializing the CGI??? if (res != CGIERR_NONE) { printf("Error # %d: %s<p>\n", res, cgilib_strerror(res)); fprintf(stderr,"Cgi-lib error."); return -1; } if (cgi_getentrystr("subname") == NULL) { fprintf(stderr,"Didn't recieve any subname."); return -1; } else { subname = cgi_getentrystr("subname"); } if (cgi_getentrystr("DocID") == NULL) { fprintf(stderr,"Didn't recieve any DocID."); return -1; } else { DocID = atol( cgi_getentrystr("DocID") ); } } html_parser_init(); printf("Showing data for Collection \"%s\", DocID %u\n\n",subname,DocID); printf("Lot: %i\n",rLotForDOCid(DocID)); if (optDelete) { memset(&DocumentIndexPost,'\0',sizeof(DocumentIndexPost)); DIWrite(&DocumentIndexPost,DocID,subname,NULL); return 0; } if (DIRead_fmode(&DocumentIndexPost,DocID,subname,'s')) { printf("Url: \"%s\"\nLanguage: %s (id: %s)\nOffensive code: %hu\nDocument type: %s\nTime tested sins last good crawl: %hu\nAdult weight: %hu\nResource size: %u\nIP Address: %u\nHtml size: %i\nImage size: %i\nUser ID: %i\nCrawler version: %f\nRepository pointer: %u\n", DocumentIndexPost.Url, getLangCode2(atoi(DocumentIndexPost.Sprok)), DocumentIndexPost.Sprok, DocumentIndexPost.Offensive_code, DocumentIndexPost.Dokumenttype, DocumentIndexPost.AntallFeiledeCrawl, DocumentIndexPost.AdultWeight, DocumentIndexPost.ResourceSize, DocumentIndexPost.IPAddress, DocumentIndexPost.htmlSize2, DocumentIndexPost.imageSize, DocumentIndexPost.userID, DocumentIndexPost.clientVersion, DocumentIndexPost.RepositoryPointer); if (DocumentIndexPost.response == 200) { printf("HTTP response: %hu\n",DocumentIndexPost.response); } else { printf("HTTP response: \033[1;31m%hu\033[0m\n",DocumentIndexPost.response); } ctime_r((time_t *)&DocumentIndexPost.CrawleDato,timebuf); timebuf[24] = '\0'; printf("Last crawled time: %u\n",DocumentIndexPost.CrawleDato); printf("Last crawled time ISO: %s\n",timebuf); printf("crc32: %u\n",DocumentIndexPost.crc32); #ifdef BLACK_BOX printf("Last seen Unix: %u\n",DocumentIndexPost.lastSeen); printf("Last seen ISO: %s", ctime(&DocumentIndexPost.lastSeen)); #endif printf("Nr of out links: %u\n",(unsigned int)DocumentIndexPost.nrOfOutLinks); char *metadesc, *title, *body; if (DocumentIndexPost.SummarySize == 0) { printf("Summary: Don't have pre-parsed summery (summary size is 0)\n"); } else if (rReadSummary(DocID,&metadesc, &title, &body,DocumentIndexPost.SummaryPointer,DocumentIndexPost.SummarySize,subname)) { printf("\nSummary:\n"); printf("\tSummary pointer: %u\n\tSummary size: %hu\n",DocumentIndexPost.SummaryPointer,DocumentIndexPost.SummarySize); printf("\tTitle from summary: \"%s\"\n\tMeta description from summary: \"%s\"\n",title,metadesc); if (optSummary) { printf("Summary body\n*******************\n%s\n*******************\n\n",body); } } else { printf("Don't have pre-parsed summery\n"); } struct ReposetoryHeaderFormat ReposetoryHeader; char *url, *attributes; if (!rReadHtml(&htmlBuffer,&htmlBufferSize,DocumentIndexPost.RepositoryPointer,DocumentIndexPost.htmlSize2,DocID,subname,&ReposetoryHeader,&acl_allowbuffer,&acl_deniedbuffer,DocumentIndexPost.imageSize, &url, &attributes)) { printf("rReadHtml: did not returne true!\n"); return; } printf("Entire url: %s\n", url); #ifdef BLACK_BOX printf("acl allow raw: \"%s\"\n",acl_allowbuffer); printf("acl denied raw: \"%s\"\n",acl_deniedbuffer); printf("acl allow resolved: \"%s\"\n",aclResolv(acl_allowbuffer)); printf("acl denied resolved: \"%s\"\n",aclResolv(acl_deniedbuffer)); printf("PopRank: %d\n", ReposetoryHeader.PopRank); #endif if (optShowhtml) { printf("html uncompresed size %i\n",htmlBufferSize); printf("html buff:\n*******************************\n"); fwrite(htmlBuffer,htmlBufferSize,1,stdout); printf("\n*******************************\n\n"); } if (optShowWords) { printf("words:\n"); //run_html_parser( DocumentIndexPost.Url, htmlBuffer, htmlBufferSize, fn ); char *title, *body; html_parser_run(url,htmlBuffer, htmlBufferSize,&title, &body,fn,NULL ); } if (optResource) { char buf[500000]; printf("Resource:\n"); printf("Ptr: 0x%x Len: %x\n", DocumentIndexPost.ResourcePointer, DocumentIndexPost.ResourceSize); if (getResource(rLotForDOCid(DocID), subname, DocID, buf, sizeof(buf)) == 0) { printf("\tDid not get any resource\n"); warn(""); } else { printf("%s\n", buf); } } printf("attributes:\"%s\"\n", attributes); free(url); free(attributes); free(acl_allowbuffer); free(acl_deniedbuffer); } else { printf("Cant read post\n"); } #ifndef BLACK_BOX if (optAdult) { int httpResponsCodes[nrOfHttpResponsCodes]; //char *title; //char *body; struct adultFormat *adult; struct pagewordsFormat *pagewords = malloc(sizeof(struct pagewordsFormat)); int AdultWeight; unsigned char langnr; if ((adult = malloc(sizeof(struct adultFormat))) == NULL) { perror("malloc argstruct.adult"); exit(1); } wordsInit(pagewords); langdetectInit(); adultLoad(adult); AdultWeight -1; handelPage(pagewords,&ReposetoryHeader,htmlBuffer,htmlBufferSize,&title,&body); wordsMakeRevIndex(pagewords,adult,&AdultWeight,&langnr); printf("adult %i\n",AdultWeight); } if (optAnchor) { int anchorBufferSize; char *anchorBuffer; anchorBufferSize = anchorRead(rLotForDOCid(DocID),subname,DocID,NULL,-1); anchorBufferSize += 1; anchorBuffer = malloc(anchorBufferSize); anchorRead(rLotForDOCid(DocID),subname,DocID,anchorBuffer,anchorBufferSize); printf("#######################################\nanchors:\n%s\n#######################################\n",anchorBuffer); free(anchorBuffer); } if (optPopRank) { popopen (&popindex,"/home/boitho/config/popindex"); PopRanindex = popRankForDocID(&popindex,DocID); popclose(&popindex); printf("popindex %i\n",PopRanindex); if (popopen (&popextern,"/home/boitho/config/popextern")) { PopRankextern = popRankForDocID(&popextern,DocID); printf("PopRankextern: %i\n",PopRankextern); popclose(&popextern); } if (popopen (&popintern,"/home/boitho/config/popintern")) { PopRankintern = popRankForDocID(&popintern,DocID); printf("PopRankintern %i\n",PopRankintern); popclose(&popintern); } if (popopen (&popnoc,"/home/boitho/config/popnoc")) { PopRanknoc = popRankForDocID(&popnoc,DocID); printf("PopRanknoc %i\n",PopRanknoc); popclose(&popnoc); } if (popopen (&popindex,"/home/boitho/config/popindex")) { PopRanindex = popRankForDocID(&popindex,DocID); printf("popindex %i\n",PopRanindex); popclose(&popindex); } printf("PopRankextern: %i\nPopRankintern %i\nPopRanknoc %i\n",PopRankextern,PopRankintern,PopRanknoc); int brank; popopenMemArray_oneLot(subname,rLotForDOCid(DocID)); brank = popRankForDocIDMemArray(DocID); printf("brank %i\n",brank); //short rank if ( (FH = fopen(SHORTPOPFILE,"rb")) == NULL ) { perror("open"); } else { if ((fseek(FH,DocID* sizeof(ShortRank),SEEK_SET) == 0) && (fread(&ShortRank,sizeof(ShortRank),1,FH) != 0)) { printf("Short rank %u\n",(unsigned char)ShortRank); } else { printf("no hort rank avalibal\n"); }; fclose(FH); } } // if optPopRank #endif }
int main (int argc, char *argv[]) { int lotNr; char lotServer[64]; int pageCount; int i; unsigned int FiltetTime; unsigned int FileOffset; char htmlcompressdbuffer[524288]; //0.5 mb char imagebuffer[524288]; //0.5 mb int httpResponsCodes[nrOfHttpResponsCodes]; struct ReposetoryHeaderFormat ReposetoryHeader; struct DocumentIndexFormat DocumentIndexPost; unsigned long int radress; FILE *revindexFilesHa[NrOfDataDirectorys]; struct adultFormat adult; unsigned int lastIndexTime; if (argc < 3) { printf("Dette programet indekserer en lot. Usage:\n\tIndexerLot lotNr subname\n"); exit(0); } for(i=0;i<nrOfHttpResponsCodes;i++) { httpResponsCodes[i] = 0; } lotNr = atoi(argv[1]); strncpy(subname,argv[2],sizeof(subname) -1); //find server based on lotnr lotlistLoad(); lotlistGetServer(lotServer,lotNr); printf("vil index lot nr %i at %s\n",lotNr,lotServer); adultLoad(&adult); langdetectInit(); //temp: må hente dette fra slot server eller fil FiltetTime = 0; FileOffset = 0; pageCount = 0; if (0) { printf("will ges pages by net\n"); revindexFilesOpenNET(revindexFilesHa); while (rGetNextNET(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,&radress,FiltetTime,FileOffset,subname)) { global_curentDocID = ReposetoryHeader.DocID; if (strchr(ReposetoryHeader.url,'?') == 0) { global_curentUrlIsDynamic = 0; } else { global_curentUrlIsDynamic = 1; } handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult); //datta skal uansett kopieres over //kopierer over di data copyRepToDi(&DocumentIndexPost,&ReposetoryHeader); DocumentIndexPost.RepositoryPointer = radress; //skiver til DocumentIndex DIWriteNET(lotServer,&DocumentIndexPost,ReposetoryHeader.DocID,subname); ++pageCount; //temp: //if(pageCount > 999) { // printf("Exeting after only %i docs\n",pageCount); // break; //} } printf("Sending pages\n"); revindexFilesSendNET(revindexFilesHa,lotNr); } else { printf("Wil acess files localy\n"); //sjekker om vi har nokk palss if (!lotHasSufficientSpace(lotNr,4096,subname)) { printf("insufficient disk space\n"); exit(1); } //finner siste indekseringstid lastIndexTime = GetLastIndexTimeForLot(lotNr,subname); if(lastIndexTime != 0) { printf("lastIndexTime is not 0, but %i\n",lastIndexTime); exit(1); } revindexFilesOpenLocal(revindexFilesHa,lotNr,"Main","wb",subname); //temp:Søker til problemområdet //FileOffset = 334603785; while (rGetNext(lotNr,&ReposetoryHeader,htmlcompressdbuffer,sizeof(htmlcompressdbuffer),imagebuffer,&radress,FiltetTime,FileOffset,subname)) { //printf("D: %u, R: %lu\n",ReposetoryHeader.DocID, radress); global_curentDocID = ReposetoryHeader.DocID; if (strchr(ReposetoryHeader.url,'?') == 0) { global_curentUrlIsDynamic = 0; } else { global_curentUrlIsDynamic = 1; } printf("%s\n",ReposetoryHeader.url); handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult); //datta skal kopieres over uanset hva som skjer //kopierer over di data copyRepToDi(&DocumentIndexPost,&ReposetoryHeader); DocumentIndexPost.RepositoryPointer = radress; //skiver til DocumentIndex DIWrite(&DocumentIndexPost,ReposetoryHeader.DocID,subname); ++pageCount; //if(pageCount > 9999) { // printf("Exeting after only %i docs\n",pageCount); // //break; // exit(1); //} } //skriver riktig indexstide til lotten setLastIndexTimeForLot(lotNr,httpResponsCodes,subname); // vi må ikke kopiere revindex filene da vi jobber på de lokale direkte } langdetectDestroy(); printf("indexed %i pages\n\n\n",pageCount); return 0; }