void *generatePagesResults(void *arg) { struct thargsF * thargs = (struct thargsF *)arg; struct DocumentIndexFormat DocumentIndex; struct ReposetoryHeaderFormat ReposetoryHeader; int canDIRead = 0; int canrReadHtml = 0; unsigned int htmlBufferSize; char *htmlBuffer; if ((htmlBuffer = malloc(max_html_size)) == NULL) { perror("can't malloc"); return; } unsigned int DocID; printf("in thread\n"); while (( DocID = NexDocID(thargs) ) != 0) { //leser DI if (!DIRead_fmode(&DocumentIndex,DocID,subname,'r')) { //hvis vi av en eller annen grun ikke kunne gjøre det kalger vi printf("Can't read DI post for %u-%i\n",DocID,rLotForDOCid(DocID)); continue; } else { ++canDIRead; } printf("url: \"%s\"\n",DocumentIndex.Url); htmlBufferSize = max_html_size; if (DocumentIndex.htmlSize == 0) { } else if (rReadHtml(htmlBuffer,&htmlBufferSize,DocumentIndex.RepositoryPointer,DocumentIndex.htmlSize,DocID, subname,&ReposetoryHeader,NULL,NULL,DocumentIndex.imageSize) != 1) { printf("Can't read html post for %u-%i\n",DocID,rLotForDOCid(DocID)); continue; } else { ++canrReadHtml; } } printf("canDIRead %i\n",canDIRead); printf("canrReadHtml %i\n",canrReadHtml); }
int main() { FILE *FH, *LOTFILE; struct stat inode; // lager en struktur for fstat å returnere. int nrOfElements; int LotNr, DocIDPlace, oldLotNr,i,n,rank; if ( (FH = fopen(SHORTPOPFILE,"rb")) == NULL ) { perror("open"); } fstat(fileno(FH),&inode); nrOfElements = inode.st_size; oldLotNr = -1; for (i=0;i<nrOfElements;i++) { if ((n=fread(&rank,sizeof(unsigned char),1,FH)) == -1) { perror("read"); } //finner lot og offset LotNr = rLotForDOCid(i); DocIDPlace = (i - LotDocIDOfset(LotNr)); //if (lotlistIsLocal(LotNr)) { // popMemArray[LotNr][DocIDPlace] = rank; //} ///////////////////////////// //debug: vise hvilkene lot vi laster if (LotNr != oldLotNr) { if (oldLotNr != -1) { rSendFileByOpenHandler(LOTFILE,"Brank",oldLotNr,"w",subname); close(LOTFILE); } //oppret et midlertidig fil får å holde datane LOTFILE = tmpfile(); printf("lot %i\n",LotNr); //printf("%i rank %i. Lot %i, ofset %i\n",i,(int)rank,LotNr,LotDocIDOfset(LotNr)); } oldLotNr = LotNr; //////////////////////////// //søker til rikig plass og skiiver fseek(LOTFILE,DocIDPlace,SEEK_SET); fwrite(&rank,sizeof(unsigned char),1,LOTFILE); //printf("DocID %i, rank %i, DocIDPlace %i\n",i,rank,DocIDPlace); } rSendFileByOpenHandler(LOTFILE,"Brank",oldLotNr,"w",subname); close(LOTFILE); close(FH); }
/*
 * Writes the path of the lot that holds DocID into FilePath.
 * The lot number comes from rLotForDOCid(); the path itself is produced by
 * GetFilPathForLot() for the given subname.
 */
void GetFilPathForLotByDocID(char *FilePath,int DocID,char subname[])
{
	const int lotForDoc = rLotForDOCid(DocID);

	GetFilPathForLot(FilePath,lotForDoc,subname);
}
int DIRead_fmode (struct DocumentIndexFormat *DocumentIndexPost, int DocID,char subname[], char filemode) { FILE *file; int forReturn = 0; #ifdef DEBUG printf("DIRead_fmode(DocID=%i, subname=\"%s\")\n",DocID,subname); #endif #ifdef DISK_PROTECTOR dp_lock(rLotForDOCid(DocID)); #endif if ((file = GetFileHandler(DocID,filemode,subname, NULL)) != NULL) { if (DIRead_post_fh(DocumentIndexPost,file)) { forReturn = 1; } //hvis vi ikke har på DI_FILE_CASHE må vi lokke filen #ifndef DI_FILE_CASHE fclose(file); #endif } else { printf("can't open DocumentIndexPost for DocID %u.\n",DocID); } if ((*DocumentIndexPost).htmlSize != 0) { (*DocumentIndexPost).htmlSize2 = (*DocumentIndexPost).htmlSize; } #ifdef DISK_PROTECTOR dp_unlock(rLotForDOCid(DocID)); #endif return forReturn; }
int lotOpenFileNoCache_direct(unsigned int DocID, char *resource, char *type, char lock, char *subname) { unsigned int LotNr = rLotForDOCid(DocID); int i; char FilePath[PATH_MAX]; char File [PATH_MAX]; int fd; printf("lotOpenFileNoCache_direct(subname: \"%s\", resource %s)\n",subname,resource); GetFilPathForLot(FilePath,LotNr,subname); strcpy(File,FilePath); strncat(File,resource,PATH_MAX); //var 128 #ifdef DEBUG printf("lotOpenFileNoCasheByLotNr: opening file \"%s\" for %s\n",File,type); #endif //hvis dette er lesing så hjelper det ikke og prøve å opprette path. Filen vil fortsatt ikke finnes if ((strcmp(type,"rb") == 0) || (strcmp(type,"r") == 0)) { if ((fd = open64(File, O_RDONLY|O_DIRECT|O_LARGEFILE)) == -1) { warn("open64: %d", fd); #ifdef DEBUG perror(File); #endif return -1; } } else { errx(1, "We can only open this for reading right now"); } #ifdef DEBUG printf("lotOpenFile: tryint to obtain lock \"%c\"\n",lock); #endif //honterer låsning if (lock == 'e') { //skal vi ha flock64() her ? flock(fd, LOCK_EX); } else if (lock == 's') { flock(fd, LOCK_SH); } #ifdef DEBUG printf("lotOpenFile: lock obtained\n"); #endif #ifdef DEBUG printf("lotOpenFileNoCasheByLotNr: finished\n"); #endif return fd; }
int DIRead_fh(struct DocumentIndexFormat *DocumentIndexPost, int DocID,char subname[], FILE *file) { int forReturn = 0; if (file == NULL) { #ifdef DEBUG printf("DIRead_fh: file isent open.\n"); #endif forReturn = DIRead_fmode(DocumentIndexPost,DocID,subname,'r'); } else { #ifdef DISK_PROTECTOR dp_lock(rLotForDOCid(DocID)); #endif //søker til riktig post if (fseek(file,DIPostAdress(DocID),0) != 0) { perror("Can't seek"); exit(1); } if (DIRead_post_fh(DocumentIndexPost,file)) { forReturn = 1; } #ifdef DISK_PROTECTOR dp_unlock(rLotForDOCid(DocID)); #endif } if ((*DocumentIndexPost).htmlSize != 0) { (*DocumentIndexPost).htmlSize2 = (*DocumentIndexPost).htmlSize; } return forReturn; }
//fjerner sider med samme domene int filterSameDomain(int showabal,struct SiderFormat *CurentSider, struct SiderFormat *Sider) { int i; int count = 0; char domainCuren[65]; char domainOther[65]; //filtrerer ikke sider vi ikke har noe domne for. Typisk ppc anonser som peger til out.cfi side på samme domene if ((*CurentSider).domain[0] == '\0') { return 0; #ifdef DEBUG printf("Warn: domain is blank, wont try to filter it.\n"); #endif } for (i=0;i<showabal;i++) { if (!Sider[i].deletet) { if (strcmp((*CurentSider).domain,Sider[i].domain) == 0) { #ifdef DEBUG if (count < 2) { printf("domain is the same. Urls Url \"%s\" (domain \"%s\", DociD %u-%i, DomainID %ho) == \"%s\" (domain \"%s\", DocID %u-%i, DomainID %ho)\n", Sider[i].DocumentIndex.Url,Sider[i].domain,Sider[i].iindex.DocID,rLotForDOCid(Sider[i].iindex.DocID),Sider[i].DomainID, (*CurentSider).DocumentIndex.Url,(*CurentSider).domain,(*CurentSider).iindex.DocID,rLotForDOCid((*CurentSider).iindex.DocID),(*CurentSider).DomainID); } #endif //printf("domain is the same. %s == %s\n",(*CurentSider).domain,Sider[i].domain); //runarb: 14.11.2007: hva gjør linjen nedenfor??? //(*CurentSider).posisjon = Sider[i].posisjon; ++count; } } } #ifdef DEBUG printf("have a total of %i from this domain\n",count); #endif if (count < 2) { return 0; } else { return 1; } }
/*
 * Writes the full path of the thumbnail for DocID into FileName:
 *   <lot path>images/<DocID % 512>/<DocID>.jpg
 * where <lot path> is whatever GetFilPathForLot() produces for the DocID's lot.
 *
 * FileName must be large enough for the lot path plus the suffix; the size is
 * not known here.
 */
void GetFilPathForThumbnaleByDocID(char *FileName,int DocID,char subname[])
{
	int LotNr;
	int ImageBucket;
	char *end;

	/* integer remainder; gives the same bucket as the former fmod(DocID,512) */
	ImageBucket = DocID % 512;

	/* find the lot path */
	LotNr = rLotForDOCid(DocID);
	GetFilPathForLot(FileName,LotNr,subname);

	/* Append in place. Passing FileName to sprintf() as both destination and
	   "%s" argument, as before, is undefined behaviour. */
	end = FileName;
	while (*end != '\0') {
		++end;
	}

	sprintf(end,"images/%i/%i.jpg",ImageBucket,DocID);
}
int popRankForDocIDMemArray(unsigned int DocID) { int LotNr,DocIDPlace; //finner lot og offset LotNr = rLotForDOCid(DocID); DocIDPlace = (DocID - LotDocIDOfset(LotNr)); if (popMemArray[LotNr] != 0) { #ifdef DEBUG printf("have rank %u, i:%i, y:%i\n",(unsigned int)popMemArray[LotNr][DocIDPlace],LotNr,DocIDPlace); #endif return popMemArray[LotNr][DocIDPlace]; } else { return 0; } }
int DIPostAdress(unsigned int DocID) { int adress = -1; int LotNr; //finner lot for denne DocIDen LotNr = rLotForDOCid(DocID); #ifdef BLACK_BOX adress = (sizeof(struct DocumentIndexFormat) + sizeof(unsigned int))* (DocID - LotDocIDOfset(LotNr)); #else adress = sizeof(struct DocumentIndexFormat) * (DocID - LotDocIDOfset(LotNr)); #endif return adress; }
int adultWeightForDocIDMemArray(int DocID) { int LotNr,DocIDPlace; //hvis vi har en negativ DocID så er noe galt if (DocID < 0) { return -3; } //filler lot og offset LotNr = rLotForDOCid(DocID); DocIDPlace = (DocID - LotDocIDOfset(LotNr)); if (adultWeightMemArray[LotNr] != 0) { return adultWeightMemArray[LotNr][DocIDPlace]; } else { return 0; } }
int main (int argc, char *argv[]) { FILE *UPDATEFILE; struct anchorfileFormat anchorfileData; //tester for at vi har fåt hvilken fil vi skal bruke if (argc < 2) { printf("Usage: ./addanchors anchorfile\n\n\tanchorfile, fil med tekster på linker\n\n"); exit(1); } if ((UPDATEFILE = fopen(argv[1],"rb")) == NULL) { printf("Cant read anchorfile "); perror(argv[1]); exit(1); } lotlistLoad(); void lotlistMarkLocals(char server[]); while(!feof(UPDATEFILE)) { fread(&anchorfileData,sizeof(struct anchorfileFormat),1,UPDATEFILE); //printf("%i : %s\n",anchorfileData.DocID,anchorfileData.text); //sjekker om dette er en lokal lot //temp: utestet: if (lotlistIsLocal(rLotForDOCid(anchorfileData.DocID))) { anchoradd(anchorfileData.DocID,anchorfileData.text,sizeof(anchorfileData.text)); } else { printf("lot is not locale"); } } fclose(UPDATEFILE); }
void disp_out_opensearch(int total_res, struct SiderFormat *results, struct queryNodeHederFormat *queryNodeHeder, int num_servers, int start, int res_per_page, char *query_escaped) { int i, x; printf("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"); printf("<rss version=\"2.0\" xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\" xmlns:atom=\"http://www.w3.org/2005/Atom\">\n"); printf(" <channel>\n"); printf(" <title>%s - Searchdaimon results</title>\n", query_escaped); printf(" <description>Results for \"%s\".</description>\n", query_escaped); printf(" <opensearch:totalResults>%i</opensearch:totalResults>\n", total_res); printf(" <opensearch:startIndex>%i</opensearch:startIndex>\n", start); printf(" <opensearch:itemsPerPage>%i</opensearch:itemsPerPage>\n", res_per_page); printf(" <atom:link rel=\"search\" type=\"application/opensearchdescription+xml\" href=\"http://%s/webclient/opensearchdescription.xml\"/>\n", getenv("HTTP_HOST")); i = res_per_page * (start -1); x = i; while ((x<(res_per_page * start)) && ( x < total_res) && (i < (queryNodeHeder->MaxsHits * num_servers))) { if (!results[i].deletet) { printf("<item>\n"); printf("\t<docid>%i-%i</docid>\n",results[i].iindex.DocID, rLotForDOCid(results[i].iindex.DocID)); printf("\t<title><![CDATA[%s]]></title>\n", results[i].title); printf("\t<link><![CDATA[%s]]></link>\n", results[i].url); printf("\t<description>%s</description>\n", results[i].description); printf("</item>\n"); //teller bare normale sider if (results[i].type == siderType_normal) { ++x; } } ++i; } printf(" </channel>\n</rss>\n"); }
int main (int argc, char *argv[]) { int lotNr; char lotServer[64]; int pageCount; int i; unsigned int FiltetTime; unsigned int FileOffset; char htmlcompressdbuffer[524288]; //0.5 mb char imagebuffer[524288]; //0.5 mb int httpResponsCodes[nrOfHttpResponsCodes]; struct ReposetoryHeaderFormat ReposetoryHeader; struct DocumentIndexFormat DocumentIndexPost; unsigned long int radress; FILE *revindexFilesHa[NrOfDataDirectorys]; struct adultFormat adult; unsigned int lastIndexTime; if (argc < 2) { printf("Dette programet indekserer en lot. Gi det et lot nummer\n"); exit(0); } for(i=0;i<nrOfHttpResponsCodes;i++) { httpResponsCodes[i] = 0; } lotNr = atoi(argv[1]); //find server based on lotnr lotlistLoad(); lotlistGetServer(lotServer,lotNr); printf("vil index lot nr %i at %s\n",lotNr,lotServer); adultLoad(&adult); //temp: må hente dette fra slot server eller fil FiltetTime = 0; FileOffset = 0; pageCount = 0; if (0) { printf("will ges pages by net\n"); revindexFilesOpenNET(revindexFilesHa); while (rGetNextNET(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,&radress,FiltetTime,FileOffset)) { global_curentDocID = ReposetoryHeader.DocID; if (strchr(ReposetoryHeader.url,'?') == 0) { global_curentUrlIsDynamic = 0; } else { global_curentUrlIsDynamic = 1; } handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult); //datta skal uansett kopieres over //kopierer over di data copyRepToDi(&DocumentIndexPost,&ReposetoryHeader); DocumentIndexPost.RepositoryPointer = radress; //skiver til DocumentIndex DIWriteNET(lotServer,&DocumentIndexPost,ReposetoryHeader.DocID); ++pageCount; //temp: //if(pageCount > 1000) { // break; //} } printf("Sending pages\n"); revindexFilesSendNET(revindexFilesHa,lotNr); } else { printf("Wil acess files localy\n"); //finner siste indekseringstid lastIndexTime = GetLastIndexTimeForLot(lotNr); //temp: 
/***********************************************************/ //if(lastIndexTime != 0) { // printf("lastIndexTime is not 0, but %i\n",lastIndexTime); // exit(1); //} //FiltetTime = lastIndexTime; //if(lastIndexTime == 0) { // printf("lastIndexTime is not 0, but %i\n",lastIndexTime); // exit(1); //} /***********************************************************/ revindexFilesOpenLocal(revindexFilesHa,lotNr); while (rGetNext(lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,&radress,FiltetTime,FileOffset)) { //printf("D: %lu, R: %lu\n",ReposetoryHeader.DocID, radress); //kan være siden er korupt, sjekker at docID gir samme lot som den vi leser if (rLotForDOCid(ReposetoryHeader.DocID) != lotNr) { printf("bad DocID %i\n",ReposetoryHeader.DocID); } //indekserer bare .no sider else if (strstr(ReposetoryHeader.url,".no/") == 0){ //ikke no } else { global_curentDocID = ReposetoryHeader.DocID; if (strchr(ReposetoryHeader.url,'?') == 0) { global_curentUrlIsDynamic = 0; } else { global_curentUrlIsDynamic = 1; } handelPage(lotServer,lotNr,&ReposetoryHeader,htmlcompressdbuffer,imagebuffer,revindexFilesHa,&DocumentIndexPost,ReposetoryHeader.DocID,httpResponsCodes,&adult); //printf("%s %i\n",ReposetoryHeader.url,DocumentIndexPost.AdultWeight); //datta skal uansett kopieres over //kopierer over di data copyRepToDi(&DocumentIndexPost,&ReposetoryHeader); DocumentIndexPost.RepositoryPointer = radress; //skiver til DocumentIndex //skriver ikke for nå: DIWrite(&DocumentIndexPost,ReposetoryHeader.DocID); ++pageCount; } //temp: //if(pageCount > 10) { // break; //} } //skriver riktig indexstide til lotten //temp: setLastIndexTimeForLot(lotNr); // vi må ikke kopiere revindex filene da vi jobber på de lokale direkte } //skriver ut en oversikt over hvilkene http responser vi kom over printf("http responses:\n"); for(i=0;i<nrOfHttpResponsCodes;i++) { if (httpResponsCodes[i] != 0) { printf("%i: %i\n",i,httpResponsCodes[i]); } } printf("indexed %i pages\n",pageCount); }
void connectHandler(int socket) { struct packedHedderFormat packedHedder; int isAuthenticated = 0; char tkeyForTest[32]; int i,n; int intrespons; int count = 0; container *attrkeys = NULL; #ifdef DEBUG_TIME struct timeval start_time, end_time; struct timeval tot_start_time, tot_end_time; gettimeofday(&tot_start_time, NULL); #endif ionice_benice(); while ((i=recv(socket, &packedHedder, sizeof(struct packedHedderFormat),MSG_WAITALL)) > 0) { #ifdef DEBUG printf("size is: %i\nversion: %i\ncommand: %i\n",packedHedder.size,packedHedder.version,packedHedder.command); #endif packedHedder.size = packedHedder.size - sizeof(packedHedder); if (attrkeys == NULL) { attrkeys = ropen(); } if (packedHedder.command == bbc_askToAuthenticate) { if ((i=recv(socket, tkeyForTest, sizeof(tkeyForTest),MSG_WAITALL)) == -1) { perror("Cant read tkeyForTest"); exit(1); } if (1) { printf("authenticated\n"); intrespons = bbc_authenticate_ok; bbdocument_init(NULL); isAuthenticated = 1; } else { printf("authenticate faild\n"); intrespons = bbc_authenticate_feiled; } if ((n=sendall(socket, &intrespons, sizeof(intrespons))) == -1) { perror("Cant recv filerest"); exit(1); } } else { if (!isAuthenticated) { printf("user not autentikated\n"); exit(1); } if (packedHedder.command == bbc_docadd) { #ifdef DEBUG printf("bbc_docadd\n"); #endif char *subname,*documenturi,*documenttype,*document,*acl_allow,*acl_denied,*title,*doctype; char *attributes; int dokument_size; unsigned int lastmodified; #ifdef DEBUG_TIME gettimeofday(&start_time, NULL); #endif //subname if ((i=recvall(socket, &intrespons, sizeof(intrespons))) == 0) { perror("Cant read intrespons"); exit(1); } subname = malloc(intrespons +1); if ((i=recvall(socket, subname, intrespons)) == 0) { perror("Cant read subname"); exit(1); } //documenturi if ((i=recvall(socket, &intrespons, sizeof(intrespons))) == 0) { perror("Cant read intrespons"); exit(1); } documenturi = malloc(intrespons +1); if ((i=recvall(socket, documenturi, intrespons)) == 0) { 
perror("Cant read documenturi"); exit(1); } //documenttype if ((i=recvall(socket, &intrespons, sizeof(intrespons))) == 0) { perror("Cant read intrespons"); exit(1); } documenttype = malloc(intrespons +1); if ((i=recvall(socket, documenttype, intrespons)) == 0) { perror("Cant read documenttype"); exit(1); } //document //dokument_size if ((i=recvall(socket, &dokument_size, sizeof(dokument_size))) == 0) { perror("Cant read dokument_size"); exit(1); } document = malloc(dokument_size +1); if (dokument_size == 0) { document[0] = '\0'; } else { if ((i=recvall(socket, document, dokument_size)) == 0) { fprintf(stderr,"Can't read document of size %i\n",dokument_size); perror("recvall"); exit(1); } } //lastmodified if ((i=recvall(socket, &lastmodified, sizeof(lastmodified))) == 0) { perror("Cant read lastmodified"); exit(1); } //acl_allow if ((i=recvall(socket, &intrespons, sizeof(intrespons))) == 0) { perror("Cant read intrespons"); exit(1); } acl_allow = malloc(intrespons +1); if ((i=recvall(socket, acl_allow, intrespons)) == 0) { perror("Cant read acl_allow"); exit(1); } //acl_denied if ((i=recvall(socket, &intrespons, sizeof(intrespons))) == 0) { perror("Cant read intrespons"); exit(1); } acl_denied = malloc(intrespons +1); if ((i=recvall(socket, acl_denied, intrespons)) == 0) { perror("Cant read acl_denied"); exit(1); } //title if ((i=recvall(socket, &intrespons, sizeof(intrespons))) == 0) { perror("Cant read intrespons"); exit(1); } title = malloc(intrespons +1); if ((i=recvall(socket, title, intrespons)) == 0) { perror("Cant read title"); exit(1); } //doctype if ((i=recvall(socket, &intrespons, sizeof(intrespons))) == 0) { perror("Cant read intrespons"); exit(1); } doctype = malloc(intrespons +1); if ((i=recvall(socket, doctype, intrespons)) == 0) { perror("Cant read doctype"); exit(1); } // Attribute list if ((i = recvall(socket, &intrespons, sizeof(intrespons))) == 0) err(1, "Can't receive attribute list len"); attributes = malloc(intrespons +1); if 
((i=recvall(socket, attributes, intrespons)) == 0) err(1, "Can't receive attribute list"); #ifdef DEBUG_TIME gettimeofday(&end_time, NULL); printf("Time debug: bbdn_docadd recv data time: %f\n",getTimeDifference(&start_time, &end_time)); #endif printf("\n"); printf("########################################################\n"); printf("Url: %s\n",documenturi); printf("got subname \"%s\": title \"%s\". Nr %i, dokument_size %i attrib: %s\n",subname,title,count,dokument_size, attributes); printf("########################################################\n"); printf("calling bbdocument_add():\n"); #ifdef DEBUG_TIME gettimeofday(&start_time, NULL); #endif intrespons = bbdocument_add(subname,documenturi,documenttype,document,dokument_size,lastmodified,acl_allow,acl_denied,title,doctype, attributes, attrkeys); printf(":bbdocument_add end\n"); printf("########################################################\n"); #ifdef DEBUG_TIME gettimeofday(&end_time, NULL); printf("Time debug: bbdn_docadd runing bbdocument_add() time: %f\n",getTimeDifference(&start_time, &end_time)); #endif free(subname); free(documenturi); free(documenttype); free(document); free(acl_allow); free(acl_denied); free(title); free(doctype); free(attributes); // send status if ((n=sendall(socket, &intrespons, sizeof(intrespons))) == -1) { perror("Cant recv filerest"); exit(1); } } else if (packedHedder.command == bbc_opencollection) { char *subname; char path[PATH_MAX]; printf("open collection\n"); if ((i=recv(socket, &intrespons, sizeof(intrespons),MSG_WAITALL)) == -1) err(1, "Cant read intrespons"); subname = malloc(intrespons +1); if ((i=recv(socket, subname, intrespons,MSG_WAITALL)) == -1) err(1, "Cant read subname"); GetFilPathForLot(path, 1, subname); strcat(path, "fullyCrawled"); unlink(path); free(subname); } else if (packedHedder.command == bbc_closecollection) { printf("closecollection\n"); char *subname; //subname if ((i=recv(socket, &intrespons, sizeof(intrespons),MSG_WAITALL)) == -1) { 
perror("Cant read intrespons"); exit(1); } subname = malloc(intrespons +1); if ((i=recv(socket, subname, intrespons,MSG_WAITALL)) == -1) { perror("Cant read subname"); exit(1); } bbdocument_close(attrkeys); attrkeys = NULL; //toDo må bruke subname, og C ikke perl her printf("cleanin lots start\n"); char command[PATH_MAX]; snprintf(command,sizeof(command),"perl %s -l -s \"%s\"",bfile("perl/cleanLots.pl"),subname); printf("running \"%s\"\n",command); intrespons = system(command); printf("cleanin lots end\n"); // legger subnamet til listen over ventene subnavn, og huper searchd. lot_recache_collection(subname); /* We are done crawling */ { int fd = lotOpenFileNoCasheByLotNrl(1, "fullyCrawled", ">>", '\0', subname); if (fd == -1) { warn("Unable to write fullyCrawled file"); } else { close(fd); } } free(subname); if ((n=sendall(socket, &intrespons, sizeof(intrespons))) == -1) { perror("Cant recv filerest"); exit(1); } } else if (packedHedder.command == bbc_deleteuri) { printf("deleteuri\n"); char *subname, *uri; //subname if ((i=recv(socket, &intrespons, sizeof(intrespons),MSG_WAITALL)) == -1) { perror("Cant read intrespons"); exit(1); } subname = malloc(intrespons +1); if ((i=recv(socket, subname, intrespons,MSG_WAITALL)) == -1) { perror("Cant read subname"); exit(1); } subname[intrespons] = '\0'; if ((i=recv(socket, &intrespons, sizeof(intrespons),MSG_WAITALL)) == -1) { perror("Cant read intrespons"); exit(1); } uri = malloc(intrespons +1); if ((i=recv(socket, uri, intrespons,MSG_WAITALL)) == -1) { perror("Cant read uri"); exit(1); } uri[intrespons] = '\0'; printf("going to delete: %s from %s\n", uri, subname); /* Add docid to the gced file */ { FILE *fh; unsigned int DocID, lastmodified; unsigned int lotNr; int err = 0; if (uriindex_get(uri, &DocID, &lastmodified, subname) == 0) { fprintf(stderr,"Unable to get uri info. 
uri=\"%s\",subname=\"%s\".",uri,subname); perror("Unable to get uri info"); err++; } if (!err) { lotNr = rLotForDOCid(DocID); if ((fh = lotOpenFileNoCasheByLotNr(lotNr,"gced","a", 'e',subname)) == NULL) { perror("can't open gced file"); err++; } else { fwrite(&DocID, sizeof(DocID), 1, fh); fclose(fh); } } if (!err) { struct reformat *re; if((re = reopen(rLotForDOCid(DocID), sizeof(struct DocumentIndexFormat), "DocumentIndex", subname, RE_HAVE_4_BYTES_VERSION_PREFIX)) == NULL) { perror("can't reopen()"); err++; } else { DIS_delete(RE_DocumentIndex(re, DocID)); reclose(re); } } //markerer at den er skitten if (!err) { FILE *dirtfh; dirtfh = lotOpenFileNoCashe(DocID,"dirty","ab",'e',subname); fwrite("1",1,1,dirtfh); fclose(dirtfh); } if (err == 0) bbdocument_delete(uri, subname); } free(subname); intrespons = 1; // Always return ok for now if ((n=sendall(socket, &intrespons, sizeof(intrespons))) == -1) { perror("Cant recv filerest"); exit(1); } } else if (packedHedder.command == bbc_deletecollection) { printf("deletecollection\n"); char *subname, *uri; //subname if ((i=recv(socket, &intrespons, sizeof(intrespons),MSG_WAITALL)) == -1) { perror("Cant read intrespons"); exit(1); } subname = malloc(intrespons +1); if ((i=recv(socket, subname, intrespons,MSG_WAITALL)) == -1) { perror("Cant read subname"); exit(1); } subname[intrespons] = '\0'; printf("going to delete collection: %s\n", subname); intrespons = bbdocument_deletecoll(subname); if ((n=sendall(socket, &intrespons, sizeof(intrespons))) == -1) { perror("Cant recv filerest"); exit(1); } free(subname); } else if (packedHedder.command == bbc_addwhisper) { whisper_t add; char *subname; if ((i=recv(socket, &intrespons, sizeof(intrespons),MSG_WAITALL)) == -1) err(1, "Cant read intrespons"); subname = malloc(intrespons+1); if ((i=recv(socket, subname, intrespons,MSG_WAITALL)) == -1) { perror("Cant read subname"); exit(1); } subname[intrespons] = '\0'; if ((i=recv(socket, &add, sizeof(add),MSG_WAITALL)) == -1) err(1, 
"Cant read add whisper"); gcwhisper_write(subname, add); free(subname); } else if (packedHedder.command == bbc_HasSufficientSpace) { char *subname; //subname if ((i=recvall(socket, &intrespons, sizeof(intrespons))) == 0) { perror("Cant read intrespons"); exit(1); } subname = malloc(intrespons +1); if ((i=recvall(socket, subname, intrespons)) == 0) { perror("Cant read subname"); exit(1); } // tester bare i lot 1 her. Må også sjekke andre loter når vi begynner å støtte frlere disker på ES. intrespons = lotHasSufficientSpace(1, 4096, subname); if ((n=sendall(socket, &intrespons, sizeof(intrespons))) == -1) { perror("Cant recv filerest"); exit(1); } printf("~Asked for HasSufficientSpace for subname \"%s\". Returnerer %d\n",subname, intrespons); free(subname); } else { printf("unnown comand. %i\n", packedHedder.command); } } ++count; // #ifdef DEBUG_BREAK_AFTER // if (count >= DEBUG_BREAK_AFTER) { // printf("exeting after %i docoments\n",count); // exit(1); // } // #endif } #ifdef DEBUG_TIME gettimeofday(&tot_end_time, NULL); printf("Time debug: bbdn total time time: %f\n",getTimeDifference(&tot_start_time, &tot_end_time)); #endif }
/*
 * Finds the DocumentIndex file for DocID, opens it, and seeks to the DocID's
 * record so the caller can read or write it directly.
 *
 * type:   'c' or 'r' open with "r+b" (read is opened read/write so DIRead and
 *         DIWrite can share a handle), 's' opens with "rb" (a true read-only
 *         open), 'w' opens with "r+b" and, failing that, creates the path and
 *         the file with "w+b". Any other value is rejected.
 * diname: file name inside the lot, or NULL for "DocumentIndex".
 *
 * With DI_FILE_CASHE defined the last opened file is kept open and reused when
 * the same lot and file name are requested again.
 *
 * Returns the positioned stream, or NULL if the file cannot be opened, the
 * path is too long or `type` is unknown. A failed seek terminates the program.
 */
FILE *GetFileHandler (unsigned int DocID,char type,char subname[], char *diname)
{
	#ifndef DI_FILE_CASHE
		FILE *DocumentIndexHA = NULL;
	#endif
	char *indexName = (diname == NULL) ? "DocumentIndex" : diname;
	char FilePath[128];
	char FileName[256];
	int LotNr;
	int needOpen = 1;
	int written;

	/* find the lot for this DocID */
	LotNr = rLotForDOCid(DocID);

	#ifdef DI_FILE_CASHE
		/* Reuse the cached handle only for the same lot AND the same file.
		   A NULL diname used to match whatever file was cached. */
		if ((LotNr == openDocumentIndex) && (strcmp(openName,indexName) == 0)) {
			needOpen = 0;
		}
	#endif

	if (needOpen) {
		GetFilPathForLot(FilePath,LotNr,subname);

		/* strncpy/strncat with the full buffer size as bound could overflow FileName */
		written = snprintf(FileName,sizeof(FileName),"%s%s",FilePath,indexName);
		if (written < 0 || (size_t)written >= sizeof(FileName)) {
			fprintf(stderr,"GetFileHandler: file name too long for lot %i\n",LotNr);
			return NULL;
		}

		#ifdef DI_FILE_CASHE
			printf("openig di file \"%s\"\n",FileName);

			/* close the file we had open, and forget it so a failed open below
			   cannot leave a closed handle marked as cached */
			if (openDocumentIndex != -1) {
				fclose(DocumentIndexHA);
				DocumentIndexHA = NULL;
				openDocumentIndex = -1;
			}
		#endif

		if (type == 'c') {
			if ((DocumentIndexHA = fopen(FileName,"r+b")) == NULL) {
				printf("%d: cant open file %s for c\n", __LINE__, FileName);
				perror(FileName);
				return NULL;
			}
		}
		else if (type == 'r') {
			/* temp: opened r+ to make DIRead and DIWrite cooperate; this breaks
			   web search when the process lacks write permission */
			if ((DocumentIndexHA = fopen(FileName,"r+b")) == NULL) {
				printf("%d: cant open file %s for r\n",__LINE__, FileName);
				perror(FileName);
				return NULL;
			}
		}
		else if (type == 's') {
			/* a true read-only open */
			if ((DocumentIndexHA = fopen(FileName,"rb")) == NULL) {
				printf("%d: cant open file %s for rb\n", __LINE__, FileName);
				perror(FileName);
				return NULL;
			}
		}
		else if (type == 'w') {
			if ((DocumentIndexHA = fopen(FileName,"r+b")) == NULL) {
				/* if that fails, create the path and the file */
				makePath(FilePath);

				if ((DocumentIndexHA = fopen(FileName,"w+b")) == NULL) {
					perror(FileName);
					return NULL;
				}
			}
		}
		else {
			/* an unknown mode used to fall through to fseek() on a NULL stream */
			fprintf(stderr,"GetFileHandler: unknown file mode '%c'\n",type);
			return NULL;
		}

		#ifdef DI_FILE_CASHE
			openDocumentIndex = LotNr;
			strscpy(openName,indexName,sizeof(openName));
		#endif
	}

	/* seek to the record for this DocID */
	if (fseek(DocumentIndexHA,DIPostAdress(DocID),SEEK_SET) != 0) {
		perror("Can't seek");
		exit(1);
	}

	return DocumentIndexHA;
}
/*
 * anchorread: builds the "Anchor" reverse index for one lot.
 *
 * Usage: ./anchorread lotnr subname
 *
 * Every anchor text returned by anchorGetNext() is lower-cased and split on
 * spaces. For each word that is not empty, not "www" and not a stop word, one
 * record is appended to the reverse-index file selected by
 * WordID % NrOfDataDirectorys:
 *     DocID (unsigned int), lang (unsigned char, always 0),
 *     WordID (unsigned long), nr (unsigned long, always 1),
 *     hits (unsigned short).
 * `hits` is a per-DocID running word position that starts at 2000, so anchor
 * hits are easy to tell apart visually from other hits, and is capped at 65535.
 *
 * Returns 0 on completion; exits with status 1 on a usage error, allocation
 * failure or when the lot has no anchors file.
 */
int main (int argc, char *argv[])
{
	int lotNr;
	int i;
	unsigned int DocID;
	char text[50];
	unsigned int radress;
	unsigned int rsize;
	char **Data;
	int TokCount;
	unsigned short hits;
	unsigned long WordID;
	unsigned long nr;	/* written with sizeof(unsigned long), so it must be that wide */
	int bucket;
	FILE *revindexFilesHa[NrOfDataDirectorys];
	unsigned char lang;
	FILE *FH;
	unsigned int DocIDPlace;
	int *nrOfLinkWordsToDocID;
	char *subname;

	/* check that we were told which lot to use */
	if (argc < 3) {
		printf("Usage: ./anchorread lotnr subname\n\n");
		exit(1);
	}

	lotNr = atoi(argv[1]);
	subname = argv[2];

	if ((nrOfLinkWordsToDocID = malloc(sizeof(int) * NrofDocIDsInLot)) == NULL) {
		perror("malloc");
		exit(1);
	}

	for (i = 0; i < NrofDocIDsInLot; i++) {
		/* start at 2000 so anchor hits are easy to tell apart from other hits */
		nrOfLinkWordsToDocID[i] = 2000;
	}

	/* only checks that the lot has an anchors file at all */
	if ((FH = lotOpenFileNoCasheByLotNr(lotNr,"anchors","rb", 's',subname)) == NULL) {
		printf("lot dont have a anchors file\n");
		exit(1);
	}
	fclose(FH);

	revindexFilesOpenLocal(revindexFilesHa,lotNr,"Anchor","wb",subname);

	while (anchorGetNext(lotNr,&DocID,text,sizeof(text),&radress,&rsize,subname)) {

		DocIDPlace = (DocID - LotDocIDOfset(rLotForDOCid(DocID)));

		/* a corrupt DocID must not index outside the per-lot counter array */
		if (DocIDPlace >= (unsigned int)NrofDocIDsInLot) {
			printf("bad DocID %u, skipping anchor\n",DocID);
			continue;
		}

		++nrOfLinkWordsToDocID[DocIDPlace];

		convert_to_lowercase((unsigned char *)text);

		#ifdef DEBUG
		if (DocID == 4999999) {
			printf("DocID %i, text: \"%s\", DocIDPlace %i, nrOfLinkWordsToDocID %i\n",DocID,text,DocIDPlace,nrOfLinkWordsToDocID[DocIDPlace]);
		}
		#endif

		if ((TokCount = split(text, " ", &Data)) == -1) {
			printf("canæt splitt \"%s\"\n",text);
			/* Data cannot be trusted after a failed split; previously it was
			   walked anyway. NOTE(review): assumes split() owns no memory on
			   failure -- confirm. */
			continue;
		}

		for (i = 0; Data[i] != NULL; i++) {

			if (Data[i][0] == '\0') {
				#ifdef DEBUG
				if (DocID == 4999999) {
					printf("emty data element\n");
				}
				#endif
			}
			else if (strcmp(Data[i],"www") == 0) {
				#ifdef DEBUG
				if (DocID == 4999999) {
					printf("www\n");
				}
				#endif
				/* "www" is not indexed but still takes up a word position */
				++nrOfLinkWordsToDocID[DocIDPlace];
			}
			else if (isStoppWord(Data[i])) {
				#ifdef DEBUG
				if (DocID == 4999999) {
					printf("stopword \"%s\"\n",Data[i]);
				}
				#endif
			}
			else {
				#ifdef DEBUG
				if (DocID == 4999999) {
					printf("\t\"%s\" %i\n",Data[i],nrOfLinkWordsToDocID[DocIDPlace]);
				}
				#endif

				WordID = crc32boitho(Data[i]);

				if (WordID == 0) {
					printf("got 0 as word id for \"%s\". Somthing may be wrong.\n",Data[i]);
				}

				bucket = WordID % NrOfDataDirectorys;

				/* the position is stored as an unsigned short */
				if (nrOfLinkWordsToDocID[DocIDPlace] > 65535) {
					hits = 65535;
				}
				else {
					hits = nrOfLinkWordsToDocID[DocIDPlace];
				}

				#ifdef DEBUG
				if (DocID == 4999999) {
					printf("\thits %i: \"%s\": %hu, bucket %i\n",i,Data[i],hits,bucket);
				}
				#endif

				if (fwrite(&DocID,sizeof(unsigned int),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite DocID");
				}

				/* runarb, 13 May 2007: language is now stored as a number. It should
				   come from the DocumentIndex, but it is not stored there. */
				lang = 0;
				nr = 1;

				if (fwrite(&lang,sizeof(unsigned char),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite lang");
				}
				if (fwrite(&WordID,sizeof(unsigned long),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite WordID");
				}
				if (fwrite(&nr,sizeof(unsigned long),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite nr");
				}
				if (fwrite(&hits,sizeof(unsigned short),1,revindexFilesHa[bucket]) != 1) {
					perror("fwrite hits");
				}

				++nrOfLinkWordsToDocID[DocIDPlace];
			}
		}

		FreeSplitList(Data);

		#ifdef DEBUG
		if (DocID == 4999999) {
			printf("\n");
		}
		#endif
	}

	free(nrOfLinkWordsToDocID);

	return 0;
}
int main(int argc, char *argv[]) { int sockfd, n; int i,y; char *strpointer; int res; char buf[MAXDATASIZE]; struct hostent *he; struct sockaddr_in their_addr; // connector's address information FILE *LOGFILE; //char hostName[] = "localhost"; //char hostName[] = "127.0.0.1"; char hostName[] = "bbs-001.boitho.com"; struct SiderFormat Sider[MaxsHits]; struct SiderHederFormat SiderHeder; char buff[64]; //generell buffer struct in_addr ipaddr; struct QueryDataForamt QueryData; //send out an HTTP header: printf("Content-type: text/xml\n\n"); //hvis vi har argumeneter er det første et query if (getenv("QUERY_STRING") == NULL) { if (argc < 2 ) { printf("Error ingen query spesifisert.\n\nEksempel på bruk for å søke på boitho:\n\tsearchkernel boitho\n\n\n"); } else { QueryData.query[0] = '\0'; for(i=1;i<argc ;i++) { sprintf(QueryData.query,"%s %s",QueryData.query,argv[i]); } //strcpy(QueryData.query,argv[1]); //printf("argc :%i %s %s\n",argc,argv[1],argv[2]); printf("query %s\n",QueryData.query); } } else { // Initialize the CGI lib res = cgi_init(); // Was there an error initializing the CGI??? 
if (res != CGIERR_NONE) { printf("Error # %d: %s<p>\n", res, cgi_strerror(res)); exit(0); } if (cgi_getentrystr("query") == NULL) { perror("Did'n receive any query."); } else { strncat(QueryData.query,cgi_getentrystr("query"),sizeof(QueryData.query)); } } if (strlen(QueryData.query) > MaxQueryLen -1) { printf("query to long\n"); exit(1); } //gjør om til liten case for(i=0;i<strlen(QueryData.query);i++) { QueryData.query[i] = tolower(QueryData.query[i]); } if ((he=gethostbyname(hostName)) == NULL) { // get the host info perror("gethostbyname"); exit(1); } if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) { perror("socket"); exit(1); } their_addr.sin_family = AF_INET; // host byte order their_addr.sin_port = htons(PORT); // short, network byte order their_addr.sin_addr = *((struct in_addr *)he->h_addr); memset(&(their_addr.sin_zero), '\0', 8); // zero the rest of the struct if (connect(sockfd, (struct sockaddr *)&their_addr, sizeof(struct sockaddr)) == -1) { perror("connect"); exit(1); } struct queryNodeHederFormat queryNodeHeder; //kopierer inn query strncpy(queryNodeHeder.query,QueryData.query,sizeof(queryNodeHeder.query) -1); //sender forespørsel sendall(sockfd,queryNodeHeder.query,sizeof(queryNodeHeder)); //motter hedderen for svaret if ((i=recv(sockfd, &SiderHeder, sizeof(SiderHeder),MSG_WAITALL)) == -1) { perror("recv"); } //printf("TotaltTreff %i,showabal %i,filtered %i,total_usecs %f\n",SiderHeder.TotaltTreff,SiderHeder.showabal,SiderHeder.filtered,SiderHeder.total_usecs); for(i=0;i<SiderHeder.showabal;i++) { if ((n=recv(sockfd, &Sider[i], sizeof(struct SiderFormat),MSG_WAITALL)) == -1) { perror("recv"); } //printf("url: %s\n",Sider[i].DocumentIndex.Url); } close(sockfd); y=0; //fjerner tegn som er eskapet med \, eks \" blir til " for(i=0;i<strlen(QueryData.query);i++) { if (QueryData.query[i] == '\\') { switch(QueryData.query[++i]) { case '"': //" buff[y++] = '&'; buff[y++] = 'q'; buff[y++] = 'u'; buff[y++] = 'o'; buff[y++] = 't'; buff[y++] = ';'; break; 
} //else { // printf("error: found \\ but no case\n"); //} } else { //printf("%c\n",QueryData.query[i]); buff[y++] = QueryData.query[i]; } } strncpy(QueryData.query,buff,sizeof(QueryData.query) -1); printf("<?xml version=\"1.0\" encoding=\"ISO-8859-1\" ?> \n"); printf("<!DOCTYPE family SYSTEM \"http://www.boitho.com/xml/search.dtd\"> \n"); printf("<search>\n"); printf("<treff-info totalt=\"%i\" query=\"%s\" hilite=\"%s\" tid=\"%f\" filtered=\"%i\" showabal=\"%i\"/>\n",SiderHeder.TotaltTreff,QueryData.query,SiderHeder.hiliteQuery,SiderHeder.total_usecs,SiderHeder.filtered,SiderHeder.showabal); for(i=0;i<SiderHeder.showabal;i++) { if (!Sider[i].deletet) { //filtrerer ut tegn som ikke er lov i xml while ((strpointer = strchr(Sider[i].DocumentIndex.Url,'&')) != NULL) { (*strpointer) = 'a'; } //while ((strpointer = strchr(Sider[i].title,'&')) != NULL) { // (*strpointer) = 'a'; //} //while ((strpointer = strchr(Sider[i].description,'&')) != NULL) { // (*strpointer) = 'a'; //} printf("<treff>\n"); printf("\t<DocID>%i-%i</DocID>\n",Sider[i].iindex.DocID,rLotForDOCid(Sider[i].iindex.DocID)); printf("\t<POSISJON>%i</POSISJON>\n",i +1); //DocumentIndex printf("\t<Url>%s</Url>\n",Sider[i].DocumentIndex.Url); printf("\t<Title>%s</Title>\n",Sider[i].title); printf("\t<AdultWeight>%hu</AdultWeight>\n",Sider[i].DocumentIndex.AdultWeight); printf("\t<Sprok>%s</Sprok>\n",Sider[i].DocumentIndex.Sprok); //temp: blir rare tegn her printf("\t<Dokumenttype>%s</Dokumenttype>\n",Sider[i].DocumentIndex.Dokumenttype); printf("\t<RepositorySize>%u</RepositorySize>\n",Sider[i].DocumentIndex.htmlSize); printf("\t<THUMBNALE>%s</THUMBNALE>\n",Sider[i].thumbnale); printf("\t<CACHE>%s</CACHE>\n",Sider[i].cacheLink); printf("\t<IMAGEWIDTH>100</IMAGEWIDTH>\n"); printf("\t<IMAGEHEIGHT>100</IMAGEHEIGHT>\n"); printf("\t<METADESCRIPTION></METADESCRIPTION>\n"); printf("\t<CATEGORY></CATEGORY>\n"); printf("\t<OFFENSIVE_CODE>FALSE</OFFENSIVE_CODE>\n"); 
printf("\t<beskrivelse>%s</beskrivelse>\n",Sider[i].description); printf("\t<TermRank>%i</TermRank>\n",Sider[i].iindex.TermRank); printf("\t<PopRank>%i</PopRank>\n",Sider[i].iindex.PopRank); printf("\t<allrank>%i</allrank>\n",Sider[i].iindex.allrank); ipaddr.s_addr = Sider[i].DocumentIndex.IPAddress; printf("\t<IPAddress>%s</IPAddress>\n",inet_ntoa(ipaddr)); printf("\t<RESPONSE>%hu</RESPONSE>\n",Sider[i].DocumentIndex.response); printf("\t<NrOfHits>%i</NrOfHits>\n",Sider[i].iindex.TermAntall); //printer ut hits (hvor i dokumenetet orde befinner seg ). printf("\t<hits>"); for (y=0; (y < Sider[i].iindex.TermAntall) && (y < MaxTermHit); y++) { printf("%hu ",Sider[i].iindex.hits[y]); } printf("</hits>\n"); printf("</treff>\n"); } } printf("</search>\n"); //ToDo: må ha låsing her if ((LOGFILE = fopen("/home/boitho/config/query.log","a")) == NULL) { perror("logfile"); } fprintf(LOGFILE,"%s %i\n",queryNodeHeder.query,SiderHeder.TotaltTreff); fclose(LOGFILE); return 0; }
/*
 * BrankCalculate: makes a linkdb file searchable.
 *
 * Usage: ./BrankCalculate linkdb indexfile
 *
 * Reads the linkdb file record by record (struct linkdb_block). Each time
 * DocID_to differs from the previous record's, the current read position in
 * the linkdb file is stored in the index file at byte offset
 * DocID_to * sizeof(off64_t).
 *
 * NOTE(review): the stored position is taken after the first record of a
 * run has been read (an older variant that subtracted the record size is
 * no longer used) -- confirm that readers of the index expect this.
 */
int main (int argc, char *argv[]) {

	FILE *LINKDBFILE;
	FILE *INDEXFILE;
	unsigned int ranged;
	struct linkdb_block linkdbPost;
	off64_t offset;
	/* both start at -1; lastLotNr used to be read uninitialised */
	int lastLotNr = -1;
	int lotNr = -1;
	unsigned int lastDocID;

	if (argc < 3) {
		printf("Dette programet tar inn en linkdb fil og gjør den søkbar\n\n\tUsage: ./BrankCalculate linkdb indexfile\n");
		exit(0);
	}

	if ((LINKDBFILE = (FILE *)fopen64(argv[1],"rb")) == NULL) {
		printf("Cant read linkdb ");
		perror(argv[1]);
		exit(1);
	}

	if ((INDEXFILE = (FILE *)fopen64(argv[2],"wb")) == NULL) {
		printf("Cant read index ");
		perror(argv[2]);
		exit(1);
	}

	ranged = 0;
	lastDocID = 0;

	/* Loop on the fread result: testing feof() before reading handles one
	   stale (or, for an empty file, uninitialised) record after the end. */
	while (fread(&linkdbPost,sizeof(linkdbPost),1,LINKDBFILE) == 1) {

		/* lot tracking is currently disabled:
		   lotNr = rLotForDOCid(linkdbPost.DocID_to); */
		if (lastLotNr != lotNr) {
			printf("%i\n",lotNr);
		}
		lastLotNr = lotNr;

		if (linkdbPost.DocID_to != lastDocID) {

			/* remember where we are in the linkdb file */
			offset = ftello64(LINKDBFILE);

			/* seek to this DocID's slot; the multiplication is done in
			   off64_t so it cannot wrap where size_t is 32 bit */
			if (fseeko64(INDEXFILE,(off64_t)linkdbPost.DocID_to * (off64_t)sizeof(offset),SEEK_SET) != 0) {
				perror("fseeko64");
				exit(1);
			}

			/* and write the offset there */
			if (fwrite(&offset,sizeof(offset),1,INDEXFILE) != 1) {
				perror("fwrite");
				exit(1);
			}
		}

		lastDocID = linkdbPost.DocID_to;

		++ranged;
	}

	if (ferror(LINKDBFILE)) {
		perror(argv[1]);
	}

	fclose(LINKDBFILE);

	/* fclose flushes the index, so a failure here means lost data */
	if (fclose(INDEXFILE) != 0) {
		perror(argv[2]);
		exit(1);
	}

	printf("Rangerte %u linker\n",ranged);

	return 0;
}
//gir andre tilgan til lot filer. Casher opne filhandlere FILE *lotOpenFile(unsigned int DocID,char resource[],char type[], char lock,char subname[]) { int LotNr; int i; char FilePath[128]; char File [128]; if (!LotFilesInalisert) { for(i=0; i < MaxOpenFiles; i++) { OpenFiles[i].LotNr = -1; } LotFilesInalisert = 1; } File[0] = '\0'; //finner i hvilken lot vi skal lese fra LotNr = rLotForDOCid(DocID); //printf("LotNr: %i, DocID: %i\n",LotNr,DocID); //begynner med å søke cashen. Lopper til vi enten er ferdig, eller til vi har funne ønskede i cashen i = 0; while ((i < MaxOpenFiles) && (OpenFiles[i].LotNr != LotNr)) { i++; } //temp: skrur av søking her med i=0 //type of og subname er også lagt til uten at det tar hensyn til det i søket i = 0; //hvis vi fant i casehn returnerer vi den if (OpenFiles[i].LotNr == LotNr && (strcmp(OpenFiles[i].subname,subname) == 0) && (strcmp(OpenFiles[i].type,type)==0) && (strcmp(OpenFiles[i].resource,resource)==0) ) { #ifdef DEBUG printf("lotOpenFile: fant en tildigere åpnet fil, returnerer den.\n"); printf("lotOpenFile: returnerer: i %i, subname \"%s\", type \"%s\", LotNr %i\n",i,OpenFiles[i].subname,OpenFiles[i].type,OpenFiles[i].LotNr); printf("lotOpenFile: file is \"%s\"\n",OpenFiles[i].filename); printf("lotOpenFile: returning file handler %p\n",OpenFiles[i].FILEHANDLER); #endif if (OpenFiles[i].FILEHANDLER == NULL) { printf("Error: FILEHANDLER is NULL\n"); #ifdef DEBUG exit(-1); #endif } return OpenFiles[i].FILEHANDLER; } //hvis ikke åpner vi og returnerer else { //hvis dette er en åpen filhånterer, må vi lukke den if (OpenFiles[i].LotNr != -1) { printf("lotOpenFile: closeing: i %i\n",i); fclose(OpenFiles[i].FILEHANDLER); OpenFiles[i].LotNr = -1; } if ((OpenFiles[i].FILEHANDLER = lotOpenFileNoCasheByLotNr( LotNr, resource,type, lock,subname)) == NULL) { printf("lotOpenFileNoCashe: can't open file\n"); return NULL; } GetFilPathForLot(FilePath,LotNr,subname); strscpy(File,FilePath,sizeof(File)); strlcat(File,resource,sizeof(File)); 
strscpy(OpenFiles[i].filename,File,sizeof(OpenFiles[i].filename)); strscpy(OpenFiles[i].resource,resource,sizeof(OpenFiles[i].resource)); strscpy(OpenFiles[i].subname,subname,sizeof(OpenFiles[i].subname)); strscpy(OpenFiles[i].type,type,sizeof(OpenFiles[i].type)); //#ifdef DEBUG printf("lotOpenFile: opening file \"%s\" for %s\n",File,type); //#endif OpenFiles[i].LotNr = LotNr; return OpenFiles[i].FILEHANDLER; } }
/*
 * Convenience wrapper: looks up the lot that DocID belongs to and forwards
 * the request, with all other arguments unchanged, to
 * lotOpenFileNoCasheByLotNrl(). Returns whatever that call returns.
 */
int lotOpenFileNoCashel(unsigned int DocID,char resource[],char type[], char lock,char subname[]) {

	int lotNr = rLotForDOCid(DocID);

	return lotOpenFileNoCasheByLotNrl(lotNr,resource,type,lock,subname);
}
void connectHandler(int socket) { struct packedHedderFormat packedHedder; int i,n; int LotNr; char lotPath[512]; char buf[100]; unsigned int FilterTime; int filnamelen; FILE *FH; struct stat inode; // lager en struktur for fstat å returnere. off_t filesize; char c; struct DocumentIndexFormat DocumentIndexPost; int DocID; struct ReposetoryHeaderFormat ReposetoryHeader; unsigned int radress; char htmlbuffer[524288]; int destLeng; char dest[512]; off_t fileBloks,filerest; char *filblocbuff; //while ((i=read(socket, &packedHedder, sizeof(struct packedHedderFormat))) > 0) { while ((i=recv(socket, &packedHedder, sizeof(struct packedHedderFormat),MSG_WAITALL)) > 0) { //printf("command: %i\n",packedHedder.command); //printf("i er %i\n",i); printf("size is: %i\nversion: %i\ncommand: %i\n",packedHedder.size,packedHedder.version,packedHedder.command); //printf("subname: %s\n",packedHedder.subname); //lar size reflektere hva som er igjen av pakken packedHedder.size = packedHedder.size - sizeof(packedHedder); if (packedHedder.command == C_rmkdir) { printf("C_rmkdir\n"); //leser data. Det skal væren en int som sier hvilken lot vi vil ha if ((i=recv(socket, &LotNr, sizeof(LotNr),MSG_WAITALL)) == -1) { perror("Cant read lotnr"); exit(1); } //leser destinasjonelengden if ((i=recv(socket, &destLeng, sizeof(destLeng),MSG_WAITALL)) == -1) { perror("Cant read destLeng"); exit(1); } if (destLeng > sizeof(dest)) { printf("dest filname is to long at %i\n",destLeng); exit(1); } //leser destinasjonene if ((i=recv(socket, &dest, destLeng,MSG_WAITALL)) == -1) { perror("Cant read dest"); exit(1); } GetFilPathForLot(lotPath,LotNr,packedHedder.subname); sprintf(lotPath,"%s%s",lotPath,dest); printf("mkdir %s\n",lotPath); makePath(lotPath); printf("~C_rmkdir\n"); } else if (packedHedder.command == C_rComand) { //leser data. 
Det skal væren en int som sier hvilken lot vi vil ha if ((i=recv(socket, &LotNr, sizeof(LotNr),MSG_WAITALL)) == -1) { perror("Cant read lotnr"); exit(1); } //leser destinasjonelengden if ((i=recv(socket, &destLeng, sizeof(destLeng),MSG_WAITALL)) == -1) { perror("Cant read destLeng"); exit(1); } if (destLeng > sizeof(dest)) { printf("dest filname is to long at %i\n",destLeng); exit(1); } //leser destinasjonene if ((i=recv(socket, &dest, destLeng,MSG_WAITALL)) == -1) { perror("Cant read dest"); exit(1); } printf("run command %s\n",dest); system(dest); } else if (packedHedder.command == C_getLotToIndex) { printf("fikk C_getLotToIndex\n"); int dirty; if ((i=recv(socket, &dirty, sizeof(dirty),MSG_WAITALL)) == -1) { perror("Cant read dirty"); exit(1); } printf("dirty: %i\n",dirty); LotNr = findLotToIndex(packedHedder.subname,dirty); printf("sending respons\n"); sendall(socket,&LotNr, sizeof(LotNr)); } else if (packedHedder.command == C_getlotHasSufficientSpace) { printf("fikk C_getLotToIndex\n"); int needSpace; int response; if ((i=read(socket, &LotNr, sizeof(LotNr))) == -1) { perror("Cant read lotnr"); exit(1); } if ((i=recv(socket, &needSpace, sizeof(needSpace),MSG_WAITALL)) == -1) { perror("Cant read dirty"); exit(1); } printf("needSpace: %i, LotNr %i\n",needSpace,LotNr); response = lotHasSufficientSpace(LotNr, needSpace, packedHedder.subname); printf("sending respons\n"); sendall(socket,&response, sizeof(response)); } else if (packedHedder.command == C_rGetSize) { printf("fikk C_rGetSize\n"); //leser data. 
Det skal væren en int som sier hvilken lot vi vil ha if ((i=read(socket, &LotNr, sizeof(LotNr))) == -1) { perror("Cant read lotnr"); exit(1); } if ((i=read(socket, &filnamelen, sizeof(filnamelen))) == -1) { perror("Cant read filnamelen"); exit(1); } if (filnamelen > sizeof(buf)) { printf("filname to long\n"); }; if ((i=read(socket, buf, filnamelen)) == -1) { perror("Cant read filnamelen"); exit(1); } printf("filname %s\n",buf); if ((FH = lotOpenFileNoCasheByLotNr(LotNr,buf,"rb",'s',packedHedder.subname)) == NULL) { perror(buf); //sending that he fil is emty fileBloks = 0; sendall(socket,&fileBloks, sizeof(fileBloks)); } else { //finner og sender il størelse fstat(fileno(FH),&inode); //filesize = inode.st_size; //sendall(socket,&filesize, sizeof(filesize)); fileBloks = inode.st_size; printf("size is %" PRId64 "\n",fileBloks); sendall(socket,&fileBloks, sizeof(fileBloks)); fclose(FH); } } else if (packedHedder.command == C_rGetFile) { printf("fikk C_rGetFile\n"); //leser data. Det skal væren en int som sier hvilken lot vi vil ha if ((i=read(socket, &LotNr, sizeof(LotNr))) == -1) { perror("Cant read lotnr"); exit(1); } if ((i=read(socket, &filnamelen, sizeof(filnamelen))) == -1) { perror("Cant read filnamelen"); exit(1); } if (filnamelen > sizeof(buf)) { printf("filname to long\n"); }; if ((i=read(socket, buf, filnamelen)) == -1) { perror("Cant read filnamelen"); exit(1); } printf("filname %s\n",buf); if ((FH = lotOpenFileNoCasheByLotNr(LotNr,buf,"rb",'s',packedHedder.subname)) == NULL) { perror(buf); //sending that the fil is emty fileBloks = 0; filerest = 0; sendall(socket,&fileBloks, sizeof(fileBloks)); sendall(socket,&filerest, sizeof(filerest)); } else { //finner og sender fil størelse fstat(fileno(FH),&inode); //filesize = inode.st_size; //sendall(socket,&filesize, sizeof(filesize)); fileBloks = (int)floor(inode.st_size / rNetTrabsferBlok); filerest = inode.st_size - (fileBloks * rNetTrabsferBlok); sendall(socket,&fileBloks, sizeof(fileBloks)); 
sendall(socket,&filerest, sizeof(filerest)); printf("sending fil. fileBloks %"PRId64", filerest %"PRId64"\n",fileBloks,filerest); filblocbuff = (char *)malloc(rNetTrabsferBlok); for(i=0; i < fileBloks; i++) { //fread(filblocbuff,sizeof(c),rNetTrabsferBlok,FH); //fread_all(const void *buf, size_t size, FILE *stream) fread_all(filblocbuff,rNetTrabsferBlok,FH, 4096); if ((n=sendall(socket, filblocbuff, rNetTrabsferBlok)) == -1) { perror("Cant recv dest"); exit(1); } } printf("did recv %i fileBloks\n",i); fread(filblocbuff,sizeof(c),filerest,FH); if ((n=sendall(socket, filblocbuff, filerest)) == -1) { perror("Cant recv filerest"); exit(1); } free(filblocbuff); /* for (i=0;i<filesize;i++) { fread(&c,sizeof(char),1,FH); send(socket, &c, sizeof(char), 0); //printf("%i\n",(int)c); } */ printf("send file end\n"); fclose(FH); } } else if (packedHedder.command == C_rGetNext) { printf("fikk C_rGetNext\n"); printf("støttes ikke lengere"); exit(1); /* //leser data. Det skal væren en unigned int som sier hvilken lot vi vil ha //har deklarert den som int her ??? 
if ((i=read(socket, &LotNr, sizeof(LotNr))) == -1) { perror("Cant read lotnr"); exit(1); } printf("leser FilterTime\n"); //leser filtertime if ((i=read(socket, &FilterTime, sizeof(FilterTime))) == -1) { perror("Cant read lotnr"); exit(1); } printf("lotnr %i FilterTime %u\n",LotNr,FilterTime); //henter inn data om den lotten if (rGetNext(LotNr,&ReposetoryHeader,htmlbuffer,NULL,&radress,FilterTime,0)) { //printf("DocId: %i url: %s\n",ReposetoryHeader.DocID,ReposetoryHeader.url); //sender pakke hedder sendpacked(socket,C_rLotData,PROTOCOLVERSION, ReposetoryHeader.htmlSize + sizeof(ReposetoryHeader) +sizeof(radress), NULL,packedHedder.subname); //sennder ReposetoryHeader'en sendall(socket,&ReposetoryHeader, sizeof(ReposetoryHeader)); //sender htmlen sendall(socket,&htmlbuffer, ReposetoryHeader.htmlSize); //sender adressen sendall(socket,&radress,sizeof(radress)); //printf("data sent\n"); //printf("rGetNext: %i\n",ReposetoryHeader.DocID); } else { sendpacked(socket,C_rEOF,PROTOCOLVERSION, 0, NULL,packedHedder.subname); printf("ferdig\n"); } */ } else if (packedHedder.command == C_DIWrite) { if ((i=recv(socket, &DocumentIndexPost, sizeof(struct DocumentIndexFormat),MSG_WAITALL)) == -1) { perror("recv"); exit(1); } if ((i=recv(socket, &DocID, sizeof(DocID),MSG_WAITALL)) == -1) { perror("recv"); exit(1); } DIWrite(&DocumentIndexPost,DocID,packedHedder.subname, NULL); //printf("DIWrite: %i\n",DocID); } else if (packedHedder.command == C_DIRead) { int DocID; struct DocumentIndexFormat DocumentIndexPost; printf("got commane C_DIRead. 
sise %i hsize %i ds %i\n",packedHedder.size, sizeof(packedHedder), sizeof(DocID)); if ((i=recv(socket, &DocID, sizeof(DocID),0)) == -1) { perror("recv"); exit(1); } //printf("DocID %i\n",DocID); //leser inn datan //int DIRead (struct DocumentIndexFormat *DocumentIndexPost, int DocID); DIRead(&DocumentIndexPost,DocID,packedHedder.subname); sendall(socket,&DocumentIndexPost, sizeof(struct DocumentIndexFormat)); } else if (packedHedder.command == C_rGetIndexTime) { int Lotnr; unsigned int IndexTime; if ((i=recv(socket, &LotNr, sizeof(LotNr),0)) == -1) { perror("recv"); exit(1); } IndexTime = GetLastIndexTimeForLot(LotNr,packedHedder.subname); sendall(socket,&IndexTime, sizeof(IndexTime)); } else if (packedHedder.command == C_rSetIndexTime) { int Lotnr; if ((i=recv(socket, &LotNr, sizeof(LotNr),0)) == -1) { perror("recv"); exit(1); } setLastIndexTimeForLot(LotNr,NULL,packedHedder.subname); } else if (packedHedder.command == C_rSendFile) { //skal mota en fil for lagring i reposetoryet //char FilePath[156]; FILE *FILEHANDLER; char c; char opentype[2]; //char *filblocbuff; //off_t fileBloks,filerest; if ((i=recv(socket, &LotNr, sizeof(LotNr),MSG_WAITALL)) == -1) { perror("Cant recv lotnr"); exit(1); } printf("lotNr %i\n",LotNr); //leser destinasjonelengden if ((i=recv(socket, &destLeng, sizeof(destLeng),MSG_WAITALL)) == -1) { perror("Cant recv destLeng"); exit(1); } if (destLeng > sizeof(dest)) { printf("dest filname is to long at %i\n",destLeng); exit(1); } //leser destinasjonene if ((i=recv(socket, &dest, destLeng,MSG_WAITALL)) == -1) { perror("Cant recv dest"); exit(1); } printf("coping %s as length %i in to lot %i\n",dest,destLeng,LotNr); if ((i=recv(socket, &opentype, sizeof(char) +1,MSG_WAITALL)) == -1) { perror("Cant recv opentype"); exit(1); } printf("opentype \"%s\"\n",opentype); //GetFilPathForLot(FilePath,LotNr,packedHedder.subname); //legger til filnavnet //strncat(FilePath,dest,sizeof(FilePath)); //leser inn filstørelsen if ((i=recv(socket, &fileBloks, 
sizeof(fileBloks),MSG_WAITALL)) == -1) { perror("Cant recv fileBloks"); exit(1); } if ((i=recv(socket, &filerest, sizeof(filerest),MSG_WAITALL)) == -1) { perror("Cant recv filerest"); exit(1); } printf("fileBloks: %" PRId64 ", filerest: %" PRId64 "\n",fileBloks,filerest); //åpner filen if ((FILEHANDLER = lotOpenFileNoCasheByLotNr(LotNr,dest,opentype,'e',packedHedder.subname)) == NULL) { perror(dest); } filblocbuff = (char *)malloc(rNetTrabsferBlok); for(i=0; i < fileBloks; i++) { if ((n=recv(socket, filblocbuff, rNetTrabsferBlok,MSG_WAITALL)) == -1) { perror("Cant recv dest"); exit(1); } fwrite(filblocbuff,sizeof(c),rNetTrabsferBlok,FILEHANDLER); } printf("did recv %i fileBloks\n",i); if ((n=recv(socket, filblocbuff, filerest,MSG_WAITALL)) == -1) { perror("Cant recv filerest"); exit(1); } fwrite(filblocbuff,sizeof(c),filerest,FILEHANDLER); free(filblocbuff); fclose(FILEHANDLER); printf("\n"); } else if (packedHedder.command == C_DIGetIp) { unsigned int DocID; struct DocumentIndexFormat DocumentIndexPost; //printf("got command C_DIGetIp\n"); if ((i=recv(socket, &DocID, sizeof(DocID),MSG_WAITALL)) == -1) { perror("recv"); exit(1); } //printf("DocID %u\n",DocID); DIRead(&DocumentIndexPost,DocID,packedHedder.subname); //printf("ipadress: %u\n",DocumentIndexPost.IPAddress); sendall(socket,&DocumentIndexPost.IPAddress, sizeof(DocumentIndexPost.IPAddress)); } else if (packedHedder.command == C_anchorAdd) { size_t textlen; unsigned int DocID; char *text; printf("Add anchor....\n"); if ((i = recv(socket, &DocID, sizeof(DocID),MSG_WAITALL)) == -1) { perror("recv"); exit(1); } else if ((i = recv(socket, &textlen, sizeof(textlen), MSG_WAITALL)) == -1) { perror("recv(textlen)"); exit(1); } text = malloc(textlen+1); text[textlen] = '\0'; if ((i = recv(socket, text, textlen, MSG_WAITALL)) == -1) { perror("recv(text)"); exit(1); } anchoraddnew(DocID, text, textlen, packedHedder.subname, NULL); printf("Text for %d: %s\n", DocID, text); free(text); } else if (packedHedder.command == 
C_anchorGet) { size_t len; char *text; int LotNr; unsigned int DocID; printf("Get anchor...\n"); if ((i = recv(socket, &DocID, sizeof(DocID),MSG_WAITALL)) == -1) { perror("recv"); exit(1); } printf("got DocID %u\n",DocID); LotNr = rLotForDOCid(DocID); printf("trying to read anchor\n"); len = anchorRead(LotNr, packedHedder.subname, DocID, NULL, -1); printf("got anchor of length %i\n",len); sendall(socket, &len, sizeof(len)); text = malloc(len+1); printf("readint it again\n"); anchorRead(LotNr, packedHedder.subname, DocID, text, len+1); sendall(socket, text, len); } else if (packedHedder.command == C_readHTML) { /* unsigned int DocID; unsigned int len; char *text; char *acla, *acld; struct DocumentIndexFormat DocIndex; struct ReposetoryHeaderFormat ReposetoryHeader; if ((i = recv(socket, &DocID, sizeof(DocID), MSG_WAITALL)) == -1) { perror("recv"); exit(1); } if ((i = recv(socket, &len, sizeof(len), MSG_WAITALL)) == -1) { perror("recv(len)"); exit(1); } printf("len %u\n",len); text = malloc(len); if (text == NULL) exit(1); DIRead(&DocIndex, DocID, packedHedder.subname); if (!rReadHtml( text, &len, DocIndex.RepositoryPointer, DocIndex.htmlSize, DocID, packedHedder.subname, &ReposetoryHeader, &acla, &acld, DocIndex.imageSize)) { len = 0; sendall(socket, &len, sizeof(len)); } else { ++len; // \0 #ifdef DEBUG printf("docID %u\n",DocID); printf("Got: (len %i, real %i) ########################\n%s\n#####################\n", len, strlen(text), text); #endif sendall(socket, &len, sizeof(len)); sendall(socket, text, len); sendall(socket, &ReposetoryHeader,sizeof(ReposetoryHeader)); } free(text); */ } /* runarb: 06 des 2007: vi har gåt bort fra denne metoden for nå, og bruker heller index over smb. 
Men tar vare på den da vi kan trenge den siden else if (packedHedder.command == C_urltodocid) { char cmd; int alloclen; char *urlbuf; if (urltodociddb == NULL) { cmd = C_DOCID_NODB; sendall(socket, &cmd, sizeof(cmd)); exit(1); } else { cmd = C_DOCID_READY; sendall(socket, &cmd, sizeof(cmd)); } cmd = C_DOCID_NEXT; alloclen = 1024; urlbuf = malloc(alloclen); do { unsigned int DocID; size_t len; if ((i = recv(socket, &cmd, sizeof(cmd), MSG_WAITALL)) == -1) { err(1, "recv(cmd)"); } if (cmd == C_DOCID_DONE) break; if ((i == recv(socket, &len, sizeof(len), MSG_WAITALL)) == -1) { err(1, "recv(len)"); } if (alloclen < len+1) { free(urlbuf); alloclen *= 2; urlbuf = malloc(alloclen); } if ((i == recv(socket, urlbuf, len, MSG_WAITALL)) == -1) { err(1, "recv(len)"); } urlbuf[len] = '\0'; if (!getDocIDFromUrl(urltodociddb, urlbuf, &DocID)) { cmd = C_DOCID_NOTFOUND; sendall(socket, &cmd, sizeof(cmd)); } else { cmd = C_DOCID_FOUND; sendall(socket, &cmd, sizeof(cmd)); sendall(socket, &DocID, sizeof(DocID)); } } while (1); free(urlbuf); } */ else { printf("unnown comand. %i\n", packedHedder.command); } //printf("size is: %i\nversion: %i\ncommand: %i\n",packedHedder.size,packedHedder.version,packedHedder.command); } //while }
int main (int argc, char *argv[]) { struct DocumentIndexFormat DocumentIndexPost; int PopRankextern; int PopRankintern; int PopRanknoc; int PopRanindex; char ShortRank; FILE *FH; struct popl popextern; struct popl popintern; struct popl popnoc; struct popl popindex; uLong htmlBufferSize = 0; char *htmlBuffer = NULL; char *acl_allowbuffer = NULL; char *acl_deniedbuffer = NULL; char timebuf[26]; int optShowhtml = 0; int optShowWords = 0; int optSummary = 0; int optAnchor = 0; int optResource = 0; int optPopRank = 0; int optDelete = 0; int optAdult = 0; unsigned int DocID; char *subname; if (getenv("QUERY_STRING") == NULL) { extern char *optarg; extern int optind, opterr, optopt; char c; while ((c=getopt(argc,argv,"hwsarpdu"))!=-1) { switch (c) { case 'h': optShowhtml = 1; break; case 'u': optAdult = 1; break; case 'w': optShowWords = 1; break; case 's': optSummary = 1; break; case 'a': optAnchor = 1; break; case 'p': optPopRank = 1; break; case 'r': optResource = 1; break; case 'd': optDelete = 1; break; default: exit(1); } } --optind; #ifdef DEBUG printf("argc %i, optind %i\n",argc,optind); #endif if ((argc - optind)!= 3) { printf("Dette programet gir info om en DocID\n\n\tUsage PageInfo DocID collection\n"); exit(1); } DocID = atol(argv[1 +optind]); subname = argv[2 +optind]; } else { printf("Content-type: text/plain\n\n"); int res; // Initialize the CGI lib res = cgi_init(); // Was there an error initializing the CGI??? 
if (res != CGIERR_NONE) { printf("Error # %d: %s<p>\n", res, cgilib_strerror(res)); fprintf(stderr,"Cgi-lib error."); return -1; } if (cgi_getentrystr("subname") == NULL) { fprintf(stderr,"Didn't recieve any subname."); return -1; } else { subname = cgi_getentrystr("subname"); } if (cgi_getentrystr("DocID") == NULL) { fprintf(stderr,"Didn't recieve any DocID."); return -1; } else { DocID = atol( cgi_getentrystr("DocID") ); } } html_parser_init(); printf("Showing data for Collection \"%s\", DocID %u\n\n",subname,DocID); printf("Lot: %i\n",rLotForDOCid(DocID)); if (optDelete) { memset(&DocumentIndexPost,'\0',sizeof(DocumentIndexPost)); DIWrite(&DocumentIndexPost,DocID,subname,NULL); return 0; } if (DIRead_fmode(&DocumentIndexPost,DocID,subname,'s')) { printf("Url: \"%s\"\nLanguage: %s (id: %s)\nOffensive code: %hu\nDocument type: %s\nTime tested sins last good crawl: %hu\nAdult weight: %hu\nResource size: %u\nIP Address: %u\nHtml size: %i\nImage size: %i\nUser ID: %i\nCrawler version: %f\nRepository pointer: %u\n", DocumentIndexPost.Url, getLangCode2(atoi(DocumentIndexPost.Sprok)), DocumentIndexPost.Sprok, DocumentIndexPost.Offensive_code, DocumentIndexPost.Dokumenttype, DocumentIndexPost.AntallFeiledeCrawl, DocumentIndexPost.AdultWeight, DocumentIndexPost.ResourceSize, DocumentIndexPost.IPAddress, DocumentIndexPost.htmlSize2, DocumentIndexPost.imageSize, DocumentIndexPost.userID, DocumentIndexPost.clientVersion, DocumentIndexPost.RepositoryPointer); if (DocumentIndexPost.response == 200) { printf("HTTP response: %hu\n",DocumentIndexPost.response); } else { printf("HTTP response: \033[1;31m%hu\033[0m\n",DocumentIndexPost.response); } ctime_r((time_t *)&DocumentIndexPost.CrawleDato,timebuf); timebuf[24] = '\0'; printf("Last crawled time: %u\n",DocumentIndexPost.CrawleDato); printf("Last crawled time ISO: %s\n",timebuf); printf("crc32: %u\n",DocumentIndexPost.crc32); #ifdef BLACK_BOX printf("Last seen Unix: %u\n",DocumentIndexPost.lastSeen); printf("Last seen ISO: %s", 
ctime(&DocumentIndexPost.lastSeen)); #endif printf("Nr of out links: %u\n",(unsigned int)DocumentIndexPost.nrOfOutLinks); char *metadesc, *title, *body; if (DocumentIndexPost.SummarySize == 0) { printf("Summary: Don't have pre-parsed summery (summary size is 0)\n"); } else if (rReadSummary(DocID,&metadesc, &title, &body,DocumentIndexPost.SummaryPointer,DocumentIndexPost.SummarySize,subname)) { printf("\nSummary:\n"); printf("\tSummary pointer: %u\n\tSummary size: %hu\n",DocumentIndexPost.SummaryPointer,DocumentIndexPost.SummarySize); printf("\tTitle from summary: \"%s\"\n\tMeta description from summary: \"%s\"\n",title,metadesc); if (optSummary) { printf("Summary body\n*******************\n%s\n*******************\n\n",body); } } else { printf("Don't have pre-parsed summery\n"); } struct ReposetoryHeaderFormat ReposetoryHeader; char *url, *attributes; if (!rReadHtml(&htmlBuffer,&htmlBufferSize,DocumentIndexPost.RepositoryPointer,DocumentIndexPost.htmlSize2,DocID,subname,&ReposetoryHeader,&acl_allowbuffer,&acl_deniedbuffer,DocumentIndexPost.imageSize, &url, &attributes)) { printf("rReadHtml: did not returne true!\n"); return; } printf("Entire url: %s\n", url); #ifdef BLACK_BOX printf("acl allow raw: \"%s\"\n",acl_allowbuffer); printf("acl denied raw: \"%s\"\n",acl_deniedbuffer); printf("acl allow resolved: \"%s\"\n",aclResolv(acl_allowbuffer)); printf("acl denied resolved: \"%s\"\n",aclResolv(acl_deniedbuffer)); printf("PopRank: %d\n", ReposetoryHeader.PopRank); #endif if (optShowhtml) { printf("html uncompresed size %i\n",htmlBufferSize); printf("html buff:\n*******************************\n"); fwrite(htmlBuffer,htmlBufferSize,1,stdout); printf("\n*******************************\n\n"); } if (optShowWords) { printf("words:\n"); //run_html_parser( DocumentIndexPost.Url, htmlBuffer, htmlBufferSize, fn ); char *title, *body; html_parser_run(url,htmlBuffer, htmlBufferSize,&title, &body,fn,NULL ); } if (optResource) { char buf[500000]; printf("Resource:\n"); printf("Ptr: 
0x%x Len: %x\n", DocumentIndexPost.ResourcePointer, DocumentIndexPost.ResourceSize); if (getResource(rLotForDOCid(DocID), subname, DocID, buf, sizeof(buf)) == 0) { printf("\tDid not get any resource\n"); warn(""); } else { printf("%s\n", buf); } } printf("attributes:\"%s\"\n", attributes); free(url); free(attributes); free(acl_allowbuffer); free(acl_deniedbuffer); } else { printf("Cant read post\n"); } #ifndef BLACK_BOX if (optAdult) { int httpResponsCodes[nrOfHttpResponsCodes]; //char *title; //char *body; struct adultFormat *adult; struct pagewordsFormat *pagewords = malloc(sizeof(struct pagewordsFormat)); int AdultWeight; unsigned char langnr; if ((adult = malloc(sizeof(struct adultFormat))) == NULL) { perror("malloc argstruct.adult"); exit(1); } wordsInit(pagewords); langdetectInit(); adultLoad(adult); AdultWeight -1; handelPage(pagewords,&ReposetoryHeader,htmlBuffer,htmlBufferSize,&title,&body); wordsMakeRevIndex(pagewords,adult,&AdultWeight,&langnr); printf("adult %i\n",AdultWeight); } if (optAnchor) { int anchorBufferSize; char *anchorBuffer; anchorBufferSize = anchorRead(rLotForDOCid(DocID),subname,DocID,NULL,-1); anchorBufferSize += 1; anchorBuffer = malloc(anchorBufferSize); anchorRead(rLotForDOCid(DocID),subname,DocID,anchorBuffer,anchorBufferSize); printf("#######################################\nanchors:\n%s\n#######################################\n",anchorBuffer); free(anchorBuffer); } if (optPopRank) { popopen (&popindex,"/home/boitho/config/popindex"); PopRanindex = popRankForDocID(&popindex,DocID); popclose(&popindex); printf("popindex %i\n",PopRanindex); if (popopen (&popextern,"/home/boitho/config/popextern")) { PopRankextern = popRankForDocID(&popextern,DocID); printf("PopRankextern: %i\n",PopRankextern); popclose(&popextern); } if (popopen (&popintern,"/home/boitho/config/popintern")) { PopRankintern = popRankForDocID(&popintern,DocID); printf("PopRankintern %i\n",PopRankintern); popclose(&popintern); } if (popopen 
(&popnoc,"/home/boitho/config/popnoc")) { PopRanknoc = popRankForDocID(&popnoc,DocID); printf("PopRanknoc %i\n",PopRanknoc); popclose(&popnoc); } if (popopen (&popindex,"/home/boitho/config/popindex")) { PopRanindex = popRankForDocID(&popindex,DocID); printf("popindex %i\n",PopRanindex); popclose(&popindex); } printf("PopRankextern: %i\nPopRankintern %i\nPopRanknoc %i\n",PopRankextern,PopRankintern,PopRanknoc); int brank; popopenMemArray_oneLot(subname,rLotForDOCid(DocID)); brank = popRankForDocIDMemArray(DocID); printf("brank %i\n",brank); //short rank if ( (FH = fopen(SHORTPOPFILE,"rb")) == NULL ) { perror("open"); } else { if ((fseek(FH,DocID* sizeof(ShortRank),SEEK_SET) == 0) && (fread(&ShortRank,sizeof(ShortRank),1,FH) != 0)) { printf("Short rank %u\n",(unsigned char)ShortRank); } else { printf("no hort rank avalibal\n"); }; fclose(FH); } } // if optPopRank #endif }