/**
 * Normalize the character set of the HTTP request line and of selected
 * HTTP request headers.
 *
 * The request URI (r->unparsed_uri) is URL-unescaped in place, converted
 * with iconv_string(), re-parsed into the request with ap_parse_uri() and
 * finally passed through ap_getparents().  Each header named in the local
 * table (currently only "Destination") is unescaped, converted and stored
 * back into r->headers_in.
 *
 * Returns 0 on success, -1 as soon as any conversion fails.
 *
 * FIXME: Should handle/consider partial success?
 *
 * @param r  Apache request object structure
 * @param cd Conversion descriptor, made by iconv_open(3).
 */
static int
iconv_header(request_rec *r, iconv_t cd)
{
    char  *header_names[] = { "Destination", NULL };
    char **name;
    char  *converted;

    /* Request line: unescape, convert, then let Apache re-parse it. */
    ap_unescape_url(r->unparsed_uri);
    converted = iconv_string(r, cd, r->unparsed_uri, strlen(r->unparsed_uri));
    if (converted == NULL) {
        return -1;
    }
    ap_parse_uri(r, converted);
    ap_getparents(r->uri); /* normalize given path for security */

    /* Request headers: same treatment for every header that is present. */
    for (name = header_names; *name != NULL; name++) {
        char *value = (char *)ap_table_get(r->headers_in, *name);

        if (value == NULL) {
            continue;
        }
        ap_unescape_url(value);
        converted = iconv_string(r, cd, value, strlen(value));
        if (converted == NULL) {
            return -1;
        }
        ap_table_set(r->headers_in, *name, converted);
    }

    return 0;
}
/// get page from network bool getPage(const char* url, int method, std::string& response) { // Todo : add proxy and authentication support response.clear(); string strurl = url; boost::regex regspace(" "); strurl = boost::regex_replace(strurl, regspace, "%20"); HttpWrap *phttpwrap = new HttpWrap(); response += phttpwrap->Get(strurl.c_str()); delete phttpwrap; phttpwrap = NULL; if (response.length() < 200) { return false; } else { boost::regex regRn("\\r\\n"); response = boost::regex_replace(response, regRn, "\\n"); boost::regex reg1("<meta(.*?)charset=(.*?)>",boost::regex::icase); boost::smatch what; std::string::const_iterator start = response.begin(); std::string::const_iterator end1 = response.end(); if( boost::regex_search(start, end1, what, reg1) ) { string strcharset(what[2].first,what[2].second); if((int)strcharset.length() == 0) { if(!IsUTF8(response.c_str(),response.length())) { string strtmp = ""; iconv_string("gbk","utf-8", response.c_str(), response.length(),strtmp,1); response = strtmp; } } else { if ((int)strcharset.find("UTF") >= 0 || (int)strcharset.find("utf") >= 0) { ; } else { string strtmp = ""; iconv_string("gbk","utf-8", response.c_str(), response.length(),strtmp,1); response = strtmp; } } } else if(!IsUTF8(response.c_str(),response.length())/*pSpiderconf->m_config.ruleCollections[pSpiderconf->m_config.plans[m_plan_id].collectionRuleId].charset != "utf-8"*/) { string strtmp = ""; iconv_string("gbk","utf-8", response.c_str(), response.length(),strtmp,1); response = strtmp; } } return true; }
/*
 * Small driver for the six-argument iconv_string(): converts a fixed test
 * string (terminating NUL included, so the result is NUL-terminated too)
 * with "CP1251" as the target code set and "UTF-8" as the source code set,
 * then prints the result.
 *
 * Returns 0 on success and 1 when the conversion fails.
 */
int main(void)
{
    const char* s = "пÑ_иÐ_Ð÷Ñ'";
    char* result = NULL;
    int status = 0;

    if (iconv_string("CP1251", "UTF-8", s, s + strlen(s) + 1, &result, NULL) < 0) {
        perror("iconv_string");
        status = 1;
    }

    /* Passing NULL to %s is undefined; print the placeholder explicitly. */
    printf("res:%s", result != NULL ? result : "(null)");

    free(result);
    return status;
}
/* Output recorded alongside the original snippet: res:(null) */
/* ----------------------------------------------------------------------- */ void mpeg_parse_psi_string(char *src, int slen, char *dest, int dlen) { char *tmp; int tlen ; unsigned ch = 0; unsigned first_byte = (unsigned)src[0] ; //fprintf(stderr, "mpeg_parse_psi_string src len=%d [0x%02x ..], dest len=%d\n", slen, first_byte, dlen) ; if (first_byte < 0x20) { ch = first_byte; src++; slen--; } memset(dest,0,dlen); //fprintf(stderr, " + ch = 0x%02x\n", ch) ; if (ch < 0x10) { //fprintf(stderr, " + handle_control_8()\n") ; /* 8bit charset */ tmp = malloc(slen); //fprintf(stderr, " + + malloc() %p\n", tmp) ; tlen = handle_control_8(src, slen, tmp, slen); //fprintf(stderr, " + + calling iconv_string() ...\n") ; iconv_string(psi_charset[ch], "UTF-8", tmp, tlen, dest, dlen); //fprintf(stderr, " + + free() %p\n", tmp) ; free(tmp); } else { //fprintf(stderr, " + iconv()\n") ; /* 16bit charset */ iconv_string(psi_charset[ch], "UTF-8", src, slen, dest, dlen); } //fprintf(stderr, "mpeg_parse_psi_string - DONE\n") ; }
/**
 * stringprep_convert - encode string using new character set
 * @str: input zero-terminated string.
 * @to_codeset: name of destination character set.
 * @from_codeset: name of origin character set, as used by @str.
 *
 * Transcode @str from @from_codeset to @to_codeset.  When the library is
 * built with iconv support the work is delegated to iconv_string();
 * without it a warning is written to stderr and an unconverted copy of
 * @str is returned instead.
 *
 * Return value: Returns newly allocated zero-terminated string which
 * is @str transcoded into to_codeset, or NULL if the copy made in the
 * no-iconv build cannot be allocated.
 **/
char *
stringprep_convert (const char *str,
		    const char *to_codeset, const char *from_codeset)
{
#if HAVE_ICONV
  return iconv_string (str, from_codeset, to_codeset);
#else
  size_t nbytes;
  char *copy;

  fprintf (stderr, "libidn: warning: libiconv not installed, cannot "
	   "convert data to UTF-8\n");

  /* Duplicate the input, terminator included. */
  nbytes = strlen (str) + 1;
  copy = (char *) malloc (nbytes);
  if (copy != NULL)
    memcpy (copy, str, nbytes);

  return copy;
#endif
}
/* Convert a string in UTF-8 to <legacy_encoding>.
 *
 * Returns the converted string as produced by iconv_string(), or NULL if
 * either argument is NULL, if normalization fails, or if the conversion
 * failed. */
char *str_utf8_to_legacy(const char *string, const char *legacy_encoding)
{
    char *utf8_composed_string;
    char *legacy_string;

    if (string == NULL || legacy_encoding == NULL)
        return NULL;

    /* convert the (possibly) decomposed utf-8 string to composed form (eg,
     * "A" followed by a combining diaeresis is converted to the single
     * precomposed character U+00C4 instead of a base character with a
     * combining accent). This is required for the following conversion to
     * Windows-1252 legacy_encoding. */
    utf8_composed_string = g_utf8_normalize(string, -1, G_NORMALIZE_DEFAULT_COMPOSE);
    if (utf8_composed_string == NULL)
        return NULL;

    legacy_string = iconv_string(utf8_composed_string, -1, "UTF-8", legacy_encoding);

    /* Memory handed out by GLib must go back through g_free(), not free(). */
    g_free(utf8_composed_string);

    return legacy_string;
}
/* Print PROMPT on stdout and read one line from stdin.
 *
 * Returns a malloc()ed buffer holding the line without its trailing
 * newline, or NULL (after reporting the error with perror) when nothing
 * could be read.  */
static char *
prompt_for_line (const char *prompt)
{
  char *line = NULL;
  size_t capacity = 0;
  size_t used;

  printf ("%s", prompt);
  if (getline (&line, &capacity, stdin) < 0)
    {
      perror ("getline");
      free (line);
      return NULL;
    }

  /* Guard against an empty buffer before looking at the last byte.  */
  used = strlen (line);
  if (used > 0 && line[used - 1] == '\n')
    line[used - 1] = '\0';

  return line;
}

/* Interactive driver for the three-argument iconv_string().
 *
 * Command line: [FROM [TO [STRING]]].  Anything not given on the command
 * line is asked for on stdin, in the order string, destination code set,
 * source code set.  The converted string is printed on success; a failed
 * conversion is reported with perror.
 *
 * Returns EXIT_FAILURE when a required line cannot be read from stdin and
 * EXIT_SUCCESS otherwise.  */
int
main (int ac, char *av[])
{
  char *in = NULL, *out = NULL;
  char *to = NULL, *from = NULL;
  /* Buffers obtained from getline, kept separately so only they are freed.  */
  char *in_buf = NULL, *to_buf = NULL, *from_buf = NULL;
  int status = EXIT_SUCCESS;

  if (ac > 1)
    from = av[1];
  if (ac > 2)
    to = av[2];
  if (ac > 3)
    in = av[3];

  if (!in)
    {
      in = in_buf = prompt_for_line ("Enter string to convert:\n\t> ");
      if (!in)
	{
	  status = EXIT_FAILURE;
	  goto done;
	}
    }

  if (!to)
    {
      to = to_buf = prompt_for_line ("Enter destination code set:\n\t> ");
      if (!to)
	{
	  status = EXIT_FAILURE;
	  goto done;
	}
    }

  if (!from)
    {
      from = from_buf = prompt_for_line ("Enter source code set:\n\t> ");
      if (!from)
	{
	  status = EXIT_FAILURE;
	  goto done;
	}
    }

  printf (" Input string: `%s'\n"
	  "From code set: `%s'\n"
	  "  To code set: `%s'\n", in, from, to);

  out = iconv_string (in, from, to);

  if (out == NULL)
    perror ("iconv");
  else
    {
      printf ("\nOutput: `%s'\n", out);
      free (out);
    }

done:
  free (in_buf);
  free (to_buf);
  free (from_buf);
  return status;
}
/* Close CD without disturbing the errno that describes the failure and
   return the function's error code. */
static int iconv_string_fail (iconv_t cd)
{
  int saved_errno = errno;
  iconv_close(cd);
  errno = saved_errno;
  return -1;
}

/* Convert the bytes in [START, END) from encoding FROMCODE to encoding
   TOCODE.

   The conversion runs in two passes: the first converts into a scratch
   buffer of tmpbufsize bytes only to measure the output, the second
   converts into a buffer of exactly that size.

   If LENGTHP is not NULL, *LENGTHP receives the output length in bytes.
   If RESULTP is NULL nothing else is done.  Otherwise *RESULTP must be NULL
   or a malloc()ed block; it is (re)allocated to the output length and
   receives the converted bytes.  No terminator is added; include the
   input's terminator in the range if one is wanted in the output.

   Besides the names iconv_open() knows, FROMCODE may be "autodetect_utf8",
   "autodetect_jp" or "autodetect_kr"; these try a fixed list of encodings
   in turn and keep the first one that does not fail with EILSEQ.

   An incomplete multibyte sequence at the end of the input (EINVAL from
   iconv) is not an error: conversion stops there and what was converted up
   to that point is returned.

   Returns 0 on success, or -1 with errno set.  When the result buffer
   cannot be allocated errno is ENOMEM and *RESULTP is left as the caller
   passed it. */
int iconv_string (const char* tocode, const char* fromcode,
                  const char* start, const char* end,
                  char** resultp, size_t* lengthp)
{
  iconv_t cd = iconv_open(tocode,fromcode);
  size_t length;
  char* result;

  if (cd == (iconv_t)(-1)) {
    if (errno != EINVAL)
      return -1;
    /* Unsupported fromcode or tocode. Check whether the caller requested
       autodetection. */
    if (!strcmp(fromcode,"autodetect_utf8")) {
      int ret;
      /* Try UTF-8 first. There are very few ISO-8859-1 inputs that would
         be valid UTF-8, but many UTF-8 inputs are valid ISO-8859-1. */
      ret = iconv_string(tocode,"UTF-8",start,end,resultp,lengthp);
      if (!(ret < 0 && errno == EILSEQ))
        return ret;
      ret = iconv_string(tocode,"ISO-8859-1",start,end,resultp,lengthp);
      return ret;
    }
    if (!strcmp(fromcode,"autodetect_jp")) {
      int ret;
      /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
         it will fail. */
      ret = iconv_string(tocode,"ISO-2022-JP-2",start,end,resultp,lengthp);
      if (!(ret < 0 && errno == EILSEQ))
        return ret;
      /* Try EUC-JP next. Short SHIFT_JIS inputs may come out wrong. This
         is unavoidable. People will condemn SHIFT_JIS.
         If we tried SHIFT_JIS first, then some short EUC-JP inputs would
         come out wrong, and people would condemn EUC-JP and Unix, which
         would not be good. */
      ret = iconv_string(tocode,"EUC-JP",start,end,resultp,lengthp);
      if (!(ret < 0 && errno == EILSEQ))
        return ret;
      /* Finally try SHIFT_JIS. */
      ret = iconv_string(tocode,"SHIFT_JIS",start,end,resultp,lengthp);
      return ret;
    }
    if (!strcmp(fromcode,"autodetect_kr")) {
      int ret;
      /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
         it will fail. */
      ret = iconv_string(tocode,"ISO-2022-KR",start,end,resultp,lengthp);
      if (!(ret < 0 && errno == EILSEQ))
        return ret;
      /* Finally try EUC-KR. */
      ret = iconv_string(tocode,"EUC-KR",start,end,resultp,lengthp);
      return ret;
    }
    errno = EINVAL;
    return -1;
  }

  /* Determine the length we need. */
  {
    size_t count = 0;
    char tmpbuf[tmpbufsize];
    const char* inptr = start;
    size_t insize = end-start;
    while (insize > 0) {
      char* outptr = tmpbuf;
      size_t outsize = tmpbufsize;
      size_t res = iconv(cd,&inptr,&insize,&outptr,&outsize);
      if (res == (size_t)(-1) && errno != E2BIG) {
        if (errno == EINVAL) {
          /* Incomplete trailing sequence: keep what this call produced so
             the real pass gets a buffer of the right size. */
          count += outptr-tmpbuf;
          break;
        }
        return iconv_string_fail(cd);
      }
      /* E2BIG only means the scratch buffer filled up; iconv has advanced
         inptr/insize, so count this chunk and go round again. */
      count += outptr-tmpbuf;
    }
    {
      /* Account for any bytes needed to return to the initial shift state. */
      char* outptr = tmpbuf;
      size_t outsize = tmpbufsize;
      size_t res = iconv(cd,NULL,NULL,&outptr,&outsize);
      if (res == (size_t)(-1))
        return iconv_string_fail(cd);
      count += outptr-tmpbuf;
    }
    length = count;
  }

  if (lengthp != NULL)
    *lengthp = length;
  if (resultp == NULL) {
    iconv_close(cd);
    return 0;
  }

  result = (char*)(*resultp == NULL ? malloc(length) : realloc(*resultp,length));
  if (length == 0) {
    *resultp = result;
    iconv_close(cd);
    return 0;
  }
  if (result == NULL) {
    /* A failed realloc leaves the old block valid; do not overwrite the
       caller's only pointer to it. */
    iconv_close(cd);
    errno = ENOMEM;
    return -1;
  }
  *resultp = result;

  iconv(cd,NULL,NULL,NULL,NULL); /* return to the initial state */

  /* Do the conversion for real. */
  {
    const char* inptr = start;
    size_t insize = end-start;
    char* outptr = result;
    size_t outsize = length;
    while (insize > 0) {
      size_t res = iconv(cd,&inptr,&insize,&outptr,&outsize);
      if (res == (size_t)(-1)) {
        if (errno == EINVAL)
          break;
        return iconv_string_fail(cd);
      }
    }
    {
      size_t res = iconv(cd,NULL,NULL,&outptr,&outsize);
      if (res == (size_t)(-1))
        return iconv_string_fail(cd);
    }
    /* Both passes must agree on the output size. */
    if (outsize != 0)
      abort();
  }

  iconv_close(cd);
  return 0;
}
int main(int argc, char* argv[]) { QINIU_ACCESS_KEY = "sn7d6X2kmRQKkNyO0_ZY_Hz2utVrXIeEmc8QutVC"; QINIU_SECRET_KEY = "jeQSWafTp7kczgR4qVa-erKOaHk0_qcvMNacxO8E"; curl_global_init(CURL_GLOBAL_ALL); Qiniu_Global_Init(-1); /* 全局初始化函数,整个进程只需要调用一次 */ if(strcmp(argv[1],"imgtest") == 0 && argc == 4) { string oldurl,newurl,articleurl; articleurl = argv[2]; oldurl = argv[3]; newurl = _expandlinks(articleurl,oldurl); printf("newurl=%s\n",oldurl.c_str()); } if(strcmp(argv[1],"-start") == 0 && argc == 4) { int ntime = atoi(argv[2]); int nprocesstimes = 1; ArticleManage m_article; theLog.SetLogFilePath(GetFullPath()); if(!ReadConfigFile((GetFullPath()+"/sqlconfig.conf").c_str(),p_config)) { return 0; } if(!ReadSpiderRulerConfigFile((GetFullPath()+"/spiderruler.conf").c_str(),p_spiderruler)) { return 0; } while(1) { m_article.start(atoi(argv[3])); theLog.WriteLog(LOG_LEVEL_SYS,"the proc ArticleExtract %d times completed!",nprocesstimes); nprocesstimes++; sleep(ntime * 60); } } if(argc == 3 && strcmp(argv[1],"-start") != 0 ) { if(!ReadSpiderRulerConfigFile((GetFullPath()+"/spiderruler.conf").c_str(),p_spiderruler)) { return 0; } string page; list<string> ImgStrList; page.clear(); const char* url = argv[1]; string sourcepagename = argv[2]; //const char* url = "http://admin.wechat.com/mp/appmsg/show?__biz=MjM5MTIwODcxNA==&appmsgid=10001872&itemidx=1&sign=d5997fecd12a3af79f8c8d65600f82a1"; printf("url=%s\n",url); string urlstr = url; if(0 == sourcepagename.compare("zatu")) { string strtmp = ""; iconv_string("utf-8","gbk", urlstr.c_str(), urlstr.length(),strtmp,1); urlstr = strtmp; printf("urlstr=%s\n",urlstr.c_str()); } int method = 0; if(!getPage(urlstr.c_str(), method,page)) { printf("不能获取URL内容\n"); return 0; } string content; string Introduction; string publishtime; string titlestr; string contentimg; page = mainpagetagclean(page); if(page.length() < 2048) { printf("the page source length too short ! 
\n"); return 0; } printf("page length=%d\n",page.length()); HtmlExtract sorceExtract(page,p_spiderruler[sourcepagename]); sorceExtract.Extract(); titlestr = sorceExtract.GetTitle(); boost::regex title_reg("((?i)( ))"); titlestr = boost::regex_replace(titlestr,title_reg,""); printf("title=%s\n",titlestr.c_str()); content = sorceExtract.ArticleContent; publishtime = sorceExtract.GetDateTime(); Introduction = sorceExtract.Introduction; strltrim(publishtime); publishtime = publish_time_deal(publishtime); if(0 == sourcepagename.compare("geekpark")) //极客公园特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(id=\"tags\").*?</div>))"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("cuntuba")) //苹果网特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"cont\").*?</div>))"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("macx")) //苹果网特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"v2-t_fsz\").*?</div>))"); content = boost::regex_replace(content,reg14,""); reg14.assign("(?i)(<a[^>]*>.*?</a>)"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("leiphone")) //雷锋网尾部特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"post_content\").*?</div>))"); content = boost::regex_replace(content,reg14,""); reg14.assign("(?i)(<div>.*?</div>)"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("tech163")) //网易科技尾部特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!id=\"endtext\").*?</div>))"); content = boost::regex_replace(content,reg14,""); } if(0 == 
sourcepagename.compare("pingwest")) //pingwest尾部特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg15("((?i)(<div.*?>[^<]+</div>))"); content = boost::regex_replace(content,reg15,""); } if(0 == sourcepagename.compare("zatu")) //杂图天下尾部特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"format_text entry-content\").*?</div>))"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("jandan")) { boost::regex reg11("((?i)(<a[^>]*>.*?</a>))"); content = boost::regex_replace(content,reg11,""); boost::regex reg12("((?i)(<span[^>]*>.*?</span>))"); content = boost::regex_replace(content,reg12,""); } if(0 == sourcepagename.compare("guaixun")) { boost::regex reg13("((?i)(<div style=\"position:absolute.*?</div>))"); content = boost::regex_replace(content,reg13,""); } content = maincontenttagclean(content); strltrim(content); boost::smatch m; boost::regex reg8; if(sourcepagename == "sinablogit") { reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")"); content = boost::regex_replace(content,reg8,""); reg8.assign("(?i)( real_src =)"); content = boost::regex_replace(content,reg8," src="); } else if(sourcepagename == "aqee") { reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")"); content = boost::regex_replace(content,reg8,""); reg8.assign("(?i)( data-original=)"); content = boost::regex_replace(content,reg8," src="); } else if(sourcepagename == "macx") { reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")"); content = boost::regex_replace(content,reg8,""); reg8.assign("(?i)( zoomfile=)"); content = boost::regex_replace(content,reg8," src="); } if(0 == sourcepagename.compare("cuntuba")) //寸土吧特殊处理 { reg8.assign("(?i)(\')"); content = boost::regex_replace(content,reg8,"\""); } reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")"); contentimg = content; boost::regex reg("((?i)<((?!img)[^>]*>))"); contentimg = 
boost::regex_replace(contentimg,reg,""); //boost::regex reg8("(?i)(src=\"([^\"]*)\")"); std::string::const_iterator start = contentimg.begin(); std::string::const_iterator end = contentimg.end(); try { while(boost::regex_search(start,end,m,reg8)) { if (m[0].matched) { string tempurl(m[0].first,m[0].second); string regurl; tempurl = tempurl.substr(tempurl.find_first_of('\"',0)+1,tempurl.find_last_of('\"')-tempurl.find_first_of('\"',0)-1); //if( 0 == sourcepagename.compare("zatu") ) // { regurl = _expandlinks(urlstr,tempurl); // } if(0 != tempurl.length()) { boost::regex reg(tempurl); content = boost::regex_replace(content,reg,regurl); ImgStrList.push_back(regurl); } start = m[0].second; } } } catch (const boost::bad_expression& e) { theLog.WriteLog(LOG_LEVEL_ERROR,"cann't create regex with %s!",urlstr.c_str()); } list<string>::iterator it; for( it = ImgStrList.begin(); it != ImgStrList.end(); it++) { printf("%s\n",(*it).c_str()); } if( 0 == Introduction.length() ) { boost::smatch m1; boost::regex reg("(?i)(<p>.*?</p>)"); std::string::const_iterator start = content.begin(); std::string::const_iterator end = content.end(); while(boost::regex_search(start,end,m1,reg)) { if (m1[0].matched) { Introduction = m1[0].str(); } break; } int pos; if( 0 == Introduction.length() ) { if(-1 != (pos = content.find_first_of("\x0d\x0a",0))) { Introduction = content.substr(0,pos); } } } Introduction = Introductioncontenttagclean(Introduction); strltrim(Introduction); if( 0 == p_spiderruler[sourcepagename].summaryisinbody.compare("yes") ) { content = Introduction + content; } printf("publishtime=%s\n",publishtime.c_str()); printf("Introduction=%s\n",Introduction.c_str()); printf("content=%s\n",content.c_str()); } Qiniu_Global_Cleanup(); /* 全局清理函数,只需要在进程退出时调用一次 */ return 0; }