iconv_t gbiconv_open ( char *tocode , char *fromcode ) {
	// hash the to/from charset names into one cache key
	uint32_t hash1 = hash32Lower_a ( tocode   , gbstrlen(tocode)   , 0 );
	uint32_t hash2 = hash32Lower_a ( fromcode , gbstrlen(fromcode) , 0 );
	uint32_t hash  = hash32h ( hash1 , hash2 );
	g_errno = 0;
	// check the converter cache first
	iconv_t *convp = (iconv_t *)s_convTable.getValue ( &hash );
	iconv_t  conv  = NULL;
	if ( convp ) conv = *convp;
	//log(LOG_DEBUG, "uni: converter %s -> %s from hash 0x%"XINT32": "
	//    "0x%"XINT32"", fromcode, tocode, hash, conv);
	if ( ! conv ) {
		//log(LOG_DEBUG, "uni: allocating new converter for "
		//    "%s to %s (hash: 0x%"XINT32")", fromcode, tocode, hash);
		conv = iconv_open ( tocode , fromcode );
		if ( conv == (iconv_t)-1 ) {
			log(LOG_WARN, "uni: failed to open converter for "
			    "%s to %s: %s (%d)", fromcode, tocode,
			    strerror(errno), errno);
			// need to stop if necessary converters don't open
			//char *xx=NULL; *xx = 0;
			g_errno = errno;
			if ( errno == EINVAL ) g_errno = EBADCHARSET;
			return conv;
		}
		// register the allocation with g_mem to keep track of it
		// (52 bytes is just an estimate of the handle's size)
		g_mem.addMem ( (void *)conv , 52 , "iconv" , 1 );
		// cache the converter
		s_convTable.addKey ( &hash , &conv );
		//log(LOG_DEBUG, "uni: saved converter 0x%"INT32" under "
		//    "hash 0x%"XINT32"", conv, hash);
	}
	else {
		// reset the cached converter to its initial shift state
		char  *dummy  = NULL;
		size_t dummy2 = 0; // JAB: warning abatement
		//size_t res = iconv(conv,NULL,NULL,&dummy,&dummy2);
		iconv ( conv , NULL , NULL , &dummy , &dummy2 );
	}
	return conv;
}
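// Minimal usage sketch (illustrative only; not part of the original
// file). Converters returned by gbiconv_open() are cached in
// s_convTable and shared across callers, so a caller must not call
// iconv_close() on the handle itself. The buffer names below
// (inPtr/inLeft/outPtr/outLeft) are hypothetical caller locals:
//
//	iconv_t cd = gbiconv_open ( "UTF-8" , "ISO-8859-1" );
//	if ( cd == (iconv_t)-1 ) {
//		// g_errno is already set; EINVAL becomes EBADCHARSET
//		return false;
//	}
//	// cd is ready for iconv(3); a cached handle was reset above
//	size_t n = iconv ( cd , &inPtr , &inLeft , &outPtr , &outLeft );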
void Blaster::gotDoc2 ( void *state , TcpSocket *s ) {
	StateBD *st = (StateBD *)state;
	// bail if we got cut off
	if ( s->m_readOffset == 0 ) {
		log("blaster: Lost the Request in gotDoc2");
		m_launched--;
		// free the state
		freeStateBD ( st );
		return;
	}
	// . don't let TcpServer free m_buf when socket is recycled/closed
	// . we own it now and are responsible for freeing it
	// s->m_readBuf = NULL;
	long long now = gettimeofdayInMilliseconds();
	// now that we have both docs, parse out their mime and content
	char    *reply1 = st->m_buf1;
	long     size1  = st->m_buf1Len;
	HttpMime mime1;
	mime1.set ( reply1 , size1 , NULL );
	char *content1    = reply1 + mime1.getMimeLen();
	long  content1Len = size1  - mime1.getMimeLen();
	unsigned long h = hash32 ( content1 , content1Len );
	// log msg
	if ( g_errno )
		logf(LOG_INFO,"blaster: got doc (%li) (%li ms) %s : %s",
		     s->m_readOffset,
		     (long)(now - s->m_startTime),
		     st->m_u2,
		     mstrerror(g_errno));
	else
		logf(LOG_INFO,"blaster: got doc (%li) (%li ms) "
		     "(hash=%lx) %s",
		     s->m_readOffset,
		     (long)(now - s->m_startTime),
		     h,
		     st->m_u2);
	if ( m_verbose ) {
		log(LOG_WARN,"blaster: content1Len=%li, content1=%s",
		    content1Len, content1);
		log(LOG_WARN,"\n");
	}
	char    *reply2 = s->m_readBuf;
	long     size2  = s->m_readOffset;
	HttpMime mime2;
	mime2.set ( reply2 , size2 , NULL );
	char *content2    = reply2 + mime2.getMimeLen();
	long  content2Len = size2  - mime2.getMimeLen();
	if ( m_verbose )
		log(LOG_WARN,"blaster: content2Len=%li, content2=%s",
		    content2Len, content2);
	// now that we have the contents, extract the outgoing links
	// from these pages; passing them to getSearchLinks() would
	// return the first x links found
	/*
	st->m_links1 = (char *)mmalloc(200*MAX_URL_LEN,"Blaster3");
	st->m_links2 = st->m_links1 + 100*MAX_URL_LEN;
	st->m_numLinks1 = 100;
	st->m_numLinks2 = 100;
	*/
	/*
	long numLinks1 = getSearchLinks(content1,content1Len,
					st->m_links1,st->m_numLinks1);
	long numLinks2 = getSearchLinks(content2,content2Len,
					st->m_links2,st->m_numLinks2);
	*/
	// NUL-terminate so the parser can treat this as a C string
	content1[content1Len] = '\0';
	//short csEnum1 = get_iana_charset(mime1.getCharset(),
	//				   mime1.getCharsetLen());
	/*
	if (csEnum1 == csUnknown)
		log(LOG_DEBUG,"blaster: Unknown charset : %s",
		    mime2.getCharset());
	*/
	Xml xml1;
	// assume utf8
	if ( ! xml1.set ( content1 ,
			  content1Len ,
			  false ,
			  0 ,
			  false ,
			  TITLEREC_CURRENT_VERSION ) ) {
		log(LOG_WARN,"blaster: Couldn't set Xml1 class in gotDoc2");
	}
	Links links1;
	Url parent;
	parent.set ( st->m_u1 );
	if ( ! links1.set ( false , // useRelNoFollow
			    &xml1 ,
			    &parent , // parent url
			    false ,   // setLinkHashes
			    NULL ,    // baseUrl
			    TITLEREC_CURRENT_VERSION , // version
			    0 ,       // niceness
			    false ,   // parent is permalink?
			    NULL ) ) { // oldLinks
		log(LOG_WARN,"blaster: Couldn't set Links class in "
		    "gotDoc2");
	}
	// NUL-terminate doc2's content as well
	content2[content2Len] = '\0';
	//short csEnum2 = get_iana_charset(mime2.getCharset(),
	//				   mime2.getCharsetLen());
	/*
	if (csEnum2 == csUnknown)
		log(LOG_DEBUG,"blaster: Unknown charset : %s",
		    mime2.getCharset());
	*/
	Xml xml2;
	if ( ! xml2.set ( content2 ,
			  content2Len ,
			  false ,
			  0 ,
			  false ,
			  TITLEREC_CURRENT_VERSION ) ) {
		log(LOG_WARN,"blaster: Couldn't set Xml2 class in gotDoc2");
	}
	Links links2;
	parent.set ( st->m_u2 );
	if ( ! links2.set ( 0 , // useRelNoFollow
			    &xml2 ,
			    &parent , // parent url
			    false ,   // setLinkHashes
			    NULL ,    // baseUrl
			    TITLEREC_CURRENT_VERSION , // version
			    0 ,       // niceness
			    false ,   // parent is permalink?
			    NULL ) ) { // oldLinks
		log(LOG_WARN,"blaster: Couldn't set Links2 class in "
		    "gotDoc2");
	}
	// hash doc2's links into a hash table; there are only about
	// 100 or so of them
	HashTableT<unsigned long, bool> urlHash;
	// add the urls from doc2 to the table, but skip links that
	// point back to a search engine (just google and gigablast for
	// now; msn and yahoo would need extra checks)
	char  domain2[256];
	long  dlen = 0;
	char *dom  = getDomFast ( st->m_u2 , &dlen );
	// don't overflow domain2
	if ( dlen > 255 ) dlen = 255;
	if ( dom ) strncpy ( domain2 , dom , dlen );
	domain2[dlen] = '\0';
	for ( long i = 0 ; i < links2.getNumLinks() ; i++ ) {
		char *ss = links2.getLink(i);
		// skip links back to the page's own domain
		if ( strstr ( ss , domain2 ) ) continue;
		// the trailing dots make sure we match "google." and
		// "gigablast." as domains
		if ( strstr ( ss , "google."          ) ) continue;
		if ( strstr ( ss , "cache:"           ) ) continue; // google's cache page
		if ( strstr ( ss , "gigablast."       ) ) continue;
		if ( strstr ( ss , "web.archive.org"  ) ) continue; // archived copies
		if ( strstr ( ss , "search.yahoo.com" ) ) continue;
		if ( strstr ( ss , "search.msn.com"   ) ) continue;
		if ( strstr ( ss , "s.teoma.com"      ) ) continue;
		if ( strstr ( ss , "search.dmoz.org"  ) ) continue;
		if ( strstr ( ss , "www.answers.com"  ) ) continue;
		if ( strstr ( ss , "cc.msncache.com"  ) ) continue; // msn's cache page
		if ( m_verbose )
			log(LOG_WARN,"blaster: link in Doc2=%s",
			    links2.getLink(i));
		unsigned long h = hash32Lower_a ( links2.getLink(i) ,
						  links2.getLinkLen(i) );
		// no need to check for conflicts; collisions don't
		// matter here
		urlHash.addKey ( h , 1 );
	}
	// now check which urls from doc1 are missing from doc2 and
	// save those for fetching below
	/* long numUrlsToCheck = links2.getNumLinks(); */
	long numUrlsNotFound = 0;
	/*
	if ( numLinks1 < numUrlsToCheck ) numUrlsToCheck = numLinks1;
	*/
	char domain1[256];
	dlen = 0;
	dom  = getDomFast ( st->m_u1 , &dlen );
	if ( dlen > 255 ) dlen = 255;
	if ( dom ) strncpy ( domain1 , dom , dlen );
	domain1[dlen] = '\0';
	// count of per-url fetches that have completed or failed;
	// initialize once, before the fetch loop below
	st->m_numUrlDocsReceived = 0;
	for ( long i = 0 ; i < links1.getNumLinks() ; i++ ) {
		char *ss = links1.getLink(i);
		// apply the same search-engine filter as above
		if ( strstr ( ss , domain1 ) ) continue;
		if ( strstr ( ss , "google."          ) ) continue;
		if ( strstr ( ss , "cache:"           ) ) continue; // google's cache page
		if ( strstr ( ss , "gigablast."       ) ) continue;
		if ( strstr ( ss , "web.archive.org"  ) ) continue; // archived copies
		if ( strstr ( ss , "search.yahoo.com" ) ) continue;
		if ( strstr ( ss , "search.msn.com"   ) ) continue;
		if ( strstr ( ss , "s.teoma.com"      ) ) continue;
		if ( strstr ( ss , "search.dmoz.org"  ) ) continue;
		if ( strstr ( ss , "www.answers.com"  ) ) continue;
		if ( strstr ( ss , "cc.msncache.com"  ) ) continue; // msn's cache page
		if ( m_verbose )
			log(LOG_WARN,"blaster: link in Doc1=%s",
			    links1.getLink(i));
		unsigned long h = hash32Lower_a ( links1.getLink(i) ,
						  links1.getLinkLen(i) );
		long slot = urlHash.getSlot ( h );
		if ( slot != -1 ) continue;
		// the url is not present in doc2, so fetch its doc
		if ( m_verbose || m_justDisplay )
			log(LOG_WARN,"blaster: NOT FOUND %s in %s",
			    links1.getLink(i), domain2);
		numUrlsNotFound++;
		// don't do anything else if we just have to display
		// the urls
		if ( m_justDisplay ) continue;
		// now get the doc for this url
		StateBD2 *st2;
		try { st2 = new (StateBD2); }
		catch ( ... ) {
			g_errno = ENOMEM;
			log("blaster: Failed. "
			    "Could not allocate %li bytes for state. "
			    "Returning HTTP status of 500.",
			    (long)sizeof(StateBD2));
			// note: st is not freed here because fetches
			// launched earlier in this loop may still
			// reference it
			return;
		}
		mnew ( st2 , sizeof(StateBD2) , "Blaster4" );
		// point to the big state
		st2->m_st = st;
		// Msg16 does 6 redirects, so we do 6 too
		st2->m_numRedirects = 6;
		//st2->m_url.set(links1.getLink(i),links1.getLinkLen(i));
		st2->m_url = links1.getLink(i);
		// no need for a proxy ip here, since we are fetching
		// docs from different IPs; faster this way
		bool status = g_httpServer.getDoc ( st2->m_url ,  // url
						    0 ,  // ip
						    0 ,  // offset
						    -1 , // size
						    0 ,  // ifModifiedSince
						    st2 ,            // state
						    gotDocWrapper3 , // callback
						    60*1000 , // timeout
						    0 , // proxy ip
						    0 , // proxy port
						    30*1024*1024 ,   // maxLen
						    30*1024*1024 ) ; // maxOtherLen
		// continue if it blocked
		if ( ! status ) continue;
		// if it did not block, there was an error
		st->m_numUrlDocsReceived++;
	}
	st->m_numUrlDocsSent = numUrlsNotFound;
	// some fetches may have failed synchronously above, so check
	// for that here
	if ( st->m_numUrlDocsReceived > 0 &&
	     st->m_numUrlDocsReceived <= st->m_numUrlDocsSent ) {
		log(LOG_WARN,"blaster: %li docs could not be sent due "
		    "to error", st->m_numUrlDocsReceived);
		m_launched--;
		freeStateBD ( st );
		return;
	}
	if ( numUrlsNotFound == 0 ) {
		// job done for this pair
		log(LOG_WARN,"blaster: All urls from %s found in %s",
		    domain1, domain2);
		m_launched--;
		// free the state
		freeStateBD ( st );
		return;
	}
	log(LOG_WARN,"blaster: %li urls from %s not found in %s",
	    numUrlsNotFound, domain1, domain2);
	if ( m_justDisplay ) {
		m_launched--;
		// free the state
		freeStateBD ( st );
	}
	return;
}
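#if 0
// Standalone sketch (compiled out; illustrative only) of the link-diff
// idea gotDoc2() uses above: hash one page's links into a set, then
// probe the other page's links against it. This version substitutes
// the standard library for the engine's HashTableT and hash32Lower_a().
#include <string>
#include <unordered_set>
#include <vector>

// return the links in "a" that do not appear in "b"
static std::vector<std::string>
linksNotIn ( const std::vector<std::string> &a ,
	     const std::vector<std::string> &b ) {
	// hash b's links for O(1) membership tests
	std::unordered_set<std::string> seen ( b.begin() , b.end() );
	std::vector<std::string> missing;
	for ( const std::string &link : a )
		// keep only the links that b lacks
		if ( seen.find(link) == seen.end() )
			missing.push_back ( link );
	return missing;
}
#endif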