/**
 * Normalize the character set of the HTTP request line and of selected
 * HTTP request headers.
 *
 * The request URI and each listed header are URL-unescaped in place, passed
 * through iconv_string() with the supplied descriptor, and the converted
 * text is stored back into the request.
 *
 * FIXME: Should handle/consider partial success?
 *
 * @param  r Apache request object structure
 * @param cd Conversion descriptor, made by iconv_open(3).
 * @return 0 on success, -1 as soon as any conversion fails.
 */
static int
iconv_header(request_rec *r, iconv_t cd) {

  /* NULL-terminated list of request headers that carry a path. */
  char  *header_names[] = { "Destination", NULL };
  char **name;
  char  *converted;

  /* Request line: unescape, convert, then re-parse the URI. */
  ap_unescape_url(r->unparsed_uri);
  converted = iconv_string(r, cd, r->unparsed_uri, strlen(r->unparsed_uri));
  if (converted == NULL)
    return -1;
  ap_parse_uri(r, converted);
  ap_getparents(r->uri); /* normalize given path for security */

  /* Request headers: same treatment for every header that is present. */
  for (name = header_names; *name; name++) {
    char *value = (char *)ap_table_get(r->headers_in, *name);

    if (value == NULL)
      continue;

    ap_unescape_url(value);
    converted = iconv_string(r, cd, value, strlen(value));
    if (converted == NULL)
      return -1;
    ap_table_set(r->headers_in, *name, converted);
  }

  return 0;
}
예제 #2
0
/// get page from network
bool getPage(const char* url, int method, std::string& response)
{
	// Todo : add proxy and authentication support
	response.clear();
	string strurl = url;
	boost::regex regspace(" ");
	strurl = boost::regex_replace(strurl, regspace, "%20");
	HttpWrap *phttpwrap = new HttpWrap();
	response += phttpwrap->Get(strurl.c_str());
	delete phttpwrap;
    phttpwrap = NULL;
	if (response.length() < 200)
	{
		return false;
	} 
	else
	{
		boost::regex regRn("\\r\\n");
		response = boost::regex_replace(response, regRn, "\\n");
		boost::regex reg1("<meta(.*?)charset=(.*?)>",boost::regex::icase); 
		boost::smatch what;
		std::string::const_iterator start = response.begin();
		std::string::const_iterator end1 = response.end();
		if( boost::regex_search(start, end1, what, reg1) )
		{
			string strcharset(what[2].first,what[2].second);
			if((int)strcharset.length() == 0)
			{
				if(!IsUTF8(response.c_str(),response.length()))
				{
					string strtmp = "";
					iconv_string("gbk","utf-8", response.c_str(), response.length(),strtmp,1);
					response = strtmp;

				}
			}
			else 
			{
				if ((int)strcharset.find("UTF") >= 0 || (int)strcharset.find("utf") >= 0)
				{
					;
				} 
				else
				{
					string strtmp = "";
					iconv_string("gbk","utf-8", response.c_str(), response.length(),strtmp,1);
					response = strtmp;
				}
			}
		}
		else if(!IsUTF8(response.c_str(),response.length())/*pSpiderconf->m_config.ruleCollections[pSpiderconf->m_config.plans[m_plan_id].collectionRuleId].charset != "utf-8"*/)
		{
			string strtmp = "";
			iconv_string("gbk","utf-8", response.c_str(), response.length(),strtmp,1);
			response = strtmp;

		}
	}
	return true;
}
예제 #3
0
/*
 * Demo: convert a UTF-8 literal with iconv_string("CP1251", "UTF-8", ...)
 * and print the result.
 *
 * The snippet was originally recorded with the output "res:(null)": the
 * conversion failed and the still-NULL result pointer was handed to
 * printf("%s"), which is undefined behaviour. The failure path now reports
 * the error and exits with a non-zero status instead.
 */
int main() {

  const char* s = "пÑ_иÐ_Ð÷Ñ'";
  char* result = NULL;

  /* The end pointer is one past the terminating NUL, so the terminator is
     converted as well and the output is a proper C string. */
  if (iconv_string("CP1251","UTF-8",s, s+strlen(s)+1, &result, NULL) < 0) {
    perror("iconv_string");
    return 1;
  }

  printf("res:%s", result != NULL ? result : "");
  return 0;
}
예제 #4
0
/* ----------------------------------------------------------------------- */
/*
 * Decode a PSI text field into dest.
 *
 * If the first byte of src is below 0x20 it is consumed as a charset
 * selector `ch` (otherwise ch stays 0). The remaining bytes are converted
 * with iconv_string(psi_charset[ch], "UTF-8", ...), presumably from that
 * charset to UTF-8. For selectors below 0x10 (8-bit charsets) the text is
 * first filtered through handle_control_8() into a scratch buffer.
 *
 * src  / slen : input bytes and their count
 * dest / dlen : output buffer and its size; always zero-filled first, so an
 *               empty or unconvertible input yields an empty string.
 *
 * Empty input, a missing buffer or an allocation failure leave dest empty
 * instead of reading or writing out of bounds.
 */
void mpeg_parse_psi_string(char *src, int slen, char *dest, int dlen)
{
    char *tmp;
    int tlen;
    unsigned ch = 0;
    unsigned first_byte;

    if (dest == NULL || dlen <= 0)
        return;
    memset(dest, 0, dlen);

    /* Nothing to look at: do not touch src[0]. */
    if (src == NULL || slen <= 0)
        return;

    first_byte = (unsigned char)src[0];
    if (first_byte < 0x20) {
        ch = first_byte;
        src++;
        slen--;
    }

    /* Only the selector byte was present. */
    if (slen == 0)
        return;

    if (ch < 0x10) {
        /* 8bit charset */
        tmp = (char *)malloc(slen);
        if (tmp == NULL)
            return;
        tlen = handle_control_8(src, slen, tmp, slen);
        iconv_string(psi_charset[ch], "UTF-8", tmp, tlen, dest, dlen);
        free(tmp);
    } else {
        /* 16bit charset */
        iconv_string(psi_charset[ch], "UTF-8", src, slen, dest, dlen);
    }
}
예제 #5
0
/**
 * stringprep_convert - encode string using new character set
 * @str: input zero-terminated string.
 * @to_codeset: name of destination character set.
 * @from_codeset: name of origin character set, as used by @str.
 *
 * Convert the string from one character set to another using the
 * system's iconv() function.  When the library is built without iconv
 * support a warning is printed to stderr and an unconverted copy of
 * @str is returned instead.
 *
 * Return value: Returns newly allocated zero-terminated string which
 *   is @str transcoded into to_codeset, or NULL if @str is NULL or
 *   memory could not be allocated.
 **/
char *
stringprep_convert (const char *str,
		    const char *to_codeset, const char *from_codeset)
{
  /* Reject a missing input up front rather than crashing in strlen(). */
  if (str == NULL)
    return NULL;

#if HAVE_ICONV
  return iconv_string (str, from_codeset, to_codeset);
#else
  {
    char *p;

    fprintf (stderr, "libidn: warning: libiconv not installed, cannot "
	     "convert data to UTF-8\n");
    p = (char *) malloc (strlen (str) + 1);
    if (!p)
      return NULL;
    return strcpy (p, str);
  }
#endif
}
예제 #6
0
파일: encoding.c 프로젝트: Reve/Shakespeer
/* Convert a string in UTF-8 to <legacy_encoding>.
 *
 * Returns the string produced by iconv_string(), or NULL if either argument
 * is NULL, normalization fails, or the conversion fails.
 */
char *str_utf8_to_legacy(const char *string, const char *legacy_encoding)
{
    if(string == NULL || legacy_encoding == NULL)
        return NULL;

    /* Convert the (possibly) decomposed utf-8 string to composed form (eg,
     * &Auml; becomes a single precomposed character instead of a base
     * character with a combining accent). Legacy single-byte encodings such
     * as Windows-1252 have no combining characters, so this must happen
     * before the conversion below.
     */
    char *utf8_composed_string = g_utf8_normalize(string, -1, G_NORMALIZE_DEFAULT_COMPOSE);
    if(utf8_composed_string == NULL)
    {
        return NULL;
    }

    char *legacy_string = iconv_string(utf8_composed_string, -1,
            "UTF-8", legacy_encoding);

    /* GLib-allocated memory is released with g_free(), not free(). */
    g_free(utf8_composed_string);

    return legacy_string;
}
예제 #7
0
/* Print PROMPT, read one line from stdin and strip its trailing newline.
   Returns a malloc'd string owned by the caller, or NULL if the read failed
   (the failure is reported with perror first).  */
static char *
prompt_line (const char *prompt)
{
  char *line = NULL;
  size_t cap = 0;
  size_t n;

  printf ("%s", prompt);
  if (getline (&line, &cap, stdin) < 0)
    {
      perror ("getline");
      free (line);
      return NULL;
    }

  n = strlen (line);
  if (n > 0 && line[n - 1] == '\n')
    line[n - 1] = '\0';
  return line;
}

/* Convert a string between two code sets with iconv_string().

   Usage: prog [FROM [TO [STRING]]] -- any argument that is missing is
   prompted for on stdin.  Exits with EXIT_FAILURE if a prompted value
   cannot be read; a failed conversion is reported with perror but, as
   before, still exits with EXIT_SUCCESS.  */
int main (int ac, char *av[])
{
  char *in = NULL, *out = NULL;
  char *to = NULL, *from = NULL;
  /* Buffers obtained from prompt_line; kept separately so that only these
     (and never the av[] strings) are freed.  */
  char *in_buf = NULL, *to_buf = NULL, *from_buf = NULL;
  int status = EXIT_SUCCESS;

  if (ac > 1)
    from = av[1];

  if (ac > 2)
    to = av[2];

  if (ac > 3)
    in = av[3];

  if (!in)
    in = in_buf = prompt_line ("Enter string to convert:\n\t> ");

  if (in && !to)
    to = to_buf = prompt_line ("Enter destination code set:\n\t> ");

  if (in && to && !from)
    from = from_buf = prompt_line ("Enter source code set:\n\t> ");

  if (!in || !to || !from)
    {
      /* A prompt could not be read; nothing sensible to convert.  */
      status = EXIT_FAILURE;
    }
  else
    {
      printf (" Input string: `%s'\n"
              "From code set: `%s'\n"
              "  To code set: `%s'\n",
              in, from, to);

      out = iconv_string (in, from, to);

      if (out == NULL)
        perror ("iconv");
      else
        {
          printf ("\nOutput: `%s'\n", out);
          free (out);
        }
    }

  free (in_buf);
  free (to_buf);
  free (from_buf);

  return status;
}
예제 #8
0
/*
 * Convert the byte range [start, end) from encoding `fromcode` to encoding
 * `tocode` using iconv, in two passes: the first pass only measures the
 * output size, the second writes into an exactly-sized buffer.
 *
 *   tocode, fromcode  Encoding names handed to iconv_open(). If iconv_open()
 *                     rejects them with EINVAL, `fromcode` may also be one of
 *                     the pseudo-encodings "autodetect_utf8",
 *                     "autodetect_jp" or "autodetect_kr", in which case the
 *                     candidate encodings are tried in turn by recursion.
 *   start, end        Input range; `end` is one past the last input byte.
 *   resultp           If NULL, only the length is computed. Otherwise
 *                     *resultp must be NULL or a pointer realloc() accepts;
 *                     it is replaced by a buffer of exactly `length` bytes.
 *                     No terminator is appended: the output is NUL-terminated
 *                     only if the input range included a NUL.
 *   lengthp           If non-NULL, receives the output length in bytes.
 *
 * Returns 0 on success, -1 on failure with errno set.
 *
 * NOTE(review): `tmpbufsize` is not defined in this block; it is expected to
 * be a compile-time constant supplied elsewhere in the file.
 */
int iconv_string (const char* tocode, const char* fromcode,
                  const char* start, const char* end,
                  char** resultp, size_t* lengthp)
{
  iconv_t cd = iconv_open(tocode,fromcode);
  size_t length;
  char* result;
  if (cd == (iconv_t)(-1)) {
    /* Any failure other than "unsupported conversion" is final. */
    if (errno != EINVAL)
      return -1;
    /* Unsupported fromcode or tocode. Check whether the caller requested
       autodetection. */
    if (!strcmp(fromcode,"autodetect_utf8")) {
      int ret;
      /* Try UTF-8 first. There are very few ISO-8859-1 inputs that would
         be valid UTF-8, but many UTF-8 inputs are valid ISO-8859-1. */
      ret = iconv_string(tocode,"UTF-8",start,end,resultp,lengthp);
      /* Only an illegal-sequence error (EILSEQ) moves on to the next
         candidate; success and every other error are returned as-is. */
      if (!(ret < 0 && errno == EILSEQ))
        return ret;
      ret = iconv_string(tocode,"ISO-8859-1",start,end,resultp,lengthp);
      return ret;
    }
    if (!strcmp(fromcode,"autodetect_jp")) {
      int ret;
      /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
         it will fail. */
      ret = iconv_string(tocode,"ISO-2022-JP-2",start,end,resultp,lengthp);
      if (!(ret < 0 && errno == EILSEQ))
        return ret;
      /* Try EUC-JP next. Short SHIFT_JIS inputs may come out wrong. This
         is unavoidable. People will condemn SHIFT_JIS.
         If we tried SHIFT_JIS first, then some short EUC-JP inputs would
         come out wrong, and people would condemn EUC-JP and Unix, which
         would not be good. */
      ret = iconv_string(tocode,"EUC-JP",start,end,resultp,lengthp);
      if (!(ret < 0 && errno == EILSEQ))
        return ret;
      /* Finally try SHIFT_JIS. */
      ret = iconv_string(tocode,"SHIFT_JIS",start,end,resultp,lengthp);
      return ret;
    }
    if (!strcmp(fromcode,"autodetect_kr")) {
      int ret;
      /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
         it will fail. */
      ret = iconv_string(tocode,"ISO-2022-KR",start,end,resultp,lengthp);
      if (!(ret < 0 && errno == EILSEQ))
        return ret;
      /* Finally try EUC-KR. */
      ret = iconv_string(tocode,"EUC-KR",start,end,resultp,lengthp);
      return ret;
    }
    /* Not an autodetect name: restore errno (strcmp calls ran in between). */
    errno = EINVAL;
    return -1;
  }
  /* Determine the length we need. */
  {
    size_t count = 0;
    /* Scratch output buffer, reused for every chunk; its contents are
       discarded, only the number of bytes produced is accumulated. */
    char tmpbuf[tmpbufsize];
    const char* inptr = start;
    size_t insize = end-start;
    while (insize > 0) {
      char* outptr = tmpbuf;
      size_t outsize = tmpbufsize;
      size_t res = iconv(cd,&inptr,&insize,&outptr,&outsize);
      if (res == (size_t)(-1)) {
        /* EINVAL ends the scan without failing (per iconv(3): incomplete
           multibyte sequence at the end of the input). Every other errno
           aborts here, including E2BIG.
           NOTE(review): iconv() reports E2BIG when tmpbuf fills up, so an
           output larger than tmpbufsize appears to fail at this point --
           confirm against the upstream version of this function. */
        if (errno == EINVAL)
          break;
        else {
          /* iconv_close() may clobber errno; preserve the iconv() error. */
          int saved_errno = errno;
          iconv_close(cd);
          errno = saved_errno;
          return -1;
        }
      }
      count += outptr-tmpbuf;
    }
    {
      /* Flush: a NULL input asks iconv to emit any pending shift sequence,
         which also counts towards the output length. */
      char* outptr = tmpbuf;
      size_t outsize = tmpbufsize;
      size_t res = iconv(cd,NULL,NULL,&outptr,&outsize);
      if (res == (size_t)(-1)) {
        int saved_errno = errno;
        iconv_close(cd);
        errno = saved_errno;
        return -1;
      }
      count += outptr-tmpbuf;
    }
    length = count;
  }
  if (lengthp != NULL)
    *lengthp = length;
  /* Caller only wanted the length. */
  if (resultp == NULL) {
    iconv_close(cd);
    return 0;
  }
  /* NOTE(review): *resultp is overwritten unconditionally, so if realloc()
     fails the caller's previous buffer pointer is lost -- confirm whether
     callers rely on *resultp being NULL after a failure. */
  result = (*resultp == NULL ? malloc(length) : realloc(*resultp,length));
  *resultp = result;
  /* Zero-length output: whatever the allocator returned is the result, and
     a NULL here is not treated as an out-of-memory error. */
  if (length == 0) {
    iconv_close(cd);
    return 0;
  }
  if (result == NULL) {
    iconv_close(cd);
    errno = ENOMEM;
    return -1;
  }
  iconv(cd,NULL,NULL,NULL,NULL); /* return to the initial state */
  /* Do the conversion for real. */
  {
    const char* inptr = start;
    size_t insize = end-start;
    char* outptr = result;
    size_t outsize = length;
    while (insize > 0) {
      size_t res = iconv(cd,&inptr,&insize,&outptr,&outsize);
      if (res == (size_t)(-1)) {
        /* Same policy as the counting pass, so both passes stop at the
           same input position. */
        if (errno == EINVAL)
          break;
        else {
          int saved_errno = errno;
          iconv_close(cd);
          errno = saved_errno;
          return -1;
        }
      }
    }
    {
      size_t res = iconv(cd,NULL,NULL,&outptr,&outsize);
      if (res == (size_t)(-1)) {
        int saved_errno = errno;
        iconv_close(cd);
        errno = saved_errno;
        return -1;
      }
    }
    /* Both passes must agree on the output size; a mismatch means the
       buffer was mis-sized, which is treated as a fatal internal error. */
    if (outsize != 0) abort();
  }
  iconv_close(cd);
  return 0;
}
예제 #9
0
/// Entry point of the article-extraction tool.
///
/// Modes, selected by the command line:
///   - `imgtest <articleurl> <oldurl>`  : expand one link with _expandlinks()
///                                        and print the result.
///   - `-start <minutes> <arg>`         : load the configuration files, then
///                                        loop forever calling
///                                        ArticleManage::start(atoi(arg)) and
///                                        sleeping <minutes> between rounds.
///   - `<url> <rulename>`               : fetch one page, extract title, time,
///                                        summary and content with the named
///                                        spider rule, rewrite image links
///                                        and print everything to stdout.
/// Any other argument combination does nothing. Always returns 0.
int main(int argc, char* argv[])
{
	// NOTE(review): credentials are hard-coded in the source. They should be
	// loaded from configuration or the environment and the committed keys
	// rotated; left in place here so that behaviour is unchanged.
	QINIU_ACCESS_KEY = "sn7d6X2kmRQKkNyO0_ZY_Hz2utVrXIeEmc8QutVC";
	QINIU_SECRET_KEY = "jeQSWafTp7kczgR4qVa-erKOaHk0_qcvMNacxO8E";
	curl_global_init(CURL_GLOBAL_ALL);
	Qiniu_Global_Init(-1);                  /* global initialisation; call once per process */

	// argc is tested before argv[1] is touched so that running the program
	// without arguments no longer passes a null pointer to strcmp.
	if(argc == 4 && strcmp(argv[1],"imgtest") == 0)
	{
		string oldurl,newurl,articleurl;
		articleurl = argv[2];
		oldurl     = argv[3];
		newurl =  _expandlinks(articleurl,oldurl);
		// Print the expanded link (the original printed oldurl by mistake).
		printf("newurl=%s\n",newurl.c_str());
	}
	if(argc == 4 && strcmp(argv[1],"-start") == 0)
	{
		int ntime = atoi(argv[2]);
		int nprocesstimes = 1;
		ArticleManage m_article;
		theLog.SetLogFilePath(GetFullPath());
		if(!ReadConfigFile((GetFullPath()+"/sqlconfig.conf").c_str(),p_config))
		{
			return 0;
		}
		if(!ReadSpiderRulerConfigFile((GetFullPath()+"/spiderruler.conf").c_str(),p_spiderruler))
		{
			return 0;
		}
		while(1)
		{
			m_article.start(atoi(argv[3]));
			theLog.WriteLog(LOG_LEVEL_SYS,"the proc ArticleExtract %d times completed!",nprocesstimes);
			nprocesstimes++;
			sleep(ntime * 60);
		}
	}

	if(argc == 3 && strcmp(argv[1],"-start") != 0 )
	{
		if(!ReadSpiderRulerConfigFile((GetFullPath()+"/spiderruler.conf").c_str(),p_spiderruler))
		{
			return 0;
		}
		string page;
		list<string> ImgStrList;
		page.clear();
		const char* url = argv[1];
		string sourcepagename = argv[2];
		printf("url=%s\n",url);
		string urlstr = url;
		if(0 == sourcepagename.compare("zatu"))
		{
			// This site gets its URL re-encoded before the request.
			string strtmp = "";
			iconv_string("utf-8","gbk", urlstr.c_str(), urlstr.length(),strtmp,1);
			urlstr = strtmp;
			printf("urlstr=%s\n",urlstr.c_str());
		}

		int method = 0;
		if(!getPage(urlstr.c_str(), method,page))
		{
			printf("不能获取URL内容\n");
			return 0;
		}
		string content;
		string Introduction;
		string publishtime;
		string titlestr;
		string contentimg;
		page = mainpagetagclean(page);
		if(page.length() < 2048)
		{
			printf("the page source length too short ! \n");
			return 0;
		}
		// length() is a size_t; cast so the argument matches "%d".
		printf("page length=%d\n",static_cast<int>(page.length()));

		HtmlExtract sorceExtract(page,p_spiderruler[sourcepagename]);
		sorceExtract.Extract();
		titlestr	= sorceExtract.GetTitle();
		boost::regex title_reg("((?i)(&nbsp;))");
		titlestr = boost::regex_replace(titlestr,title_reg,"");
		printf("title=%s\n",titlestr.c_str());
		content			= sorceExtract.ArticleContent;
		publishtime		= sorceExtract.GetDateTime();
		Introduction	= sorceExtract.Introduction;
		strltrim(publishtime);
		publishtime		= publish_time_deal(publishtime);

		// ---- Per-site clean-up of the extracted content ----
		if(0 == sourcepagename.compare("geekpark"))  // GeekPark: drop the tag block
		{
			boost::regex reg14("((?i)(<div\\s{1,4}(id=\"tags\").*?</div>))");
			content = boost::regex_replace(content,reg14,"");
		}
		if(0 == sourcepagename.compare("cuntuba"))  // cuntuba: drop every div except class="cont"
		{
			boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"cont\").*?</div>))");
			content = boost::regex_replace(content,reg14,"");
		}
		if(0 == sourcepagename.compare("macx"))  // macx: keep only the article div, drop links
		{
			boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"v2-t_fsz\").*?</div>))");
			content = boost::regex_replace(content,reg14,"");
			reg14.assign("(?i)(<a[^>]*>.*?</a>)");
			content = boost::regex_replace(content,reg14,"");
		}
		if(0 == sourcepagename.compare("leiphone"))  // Leiphone: strip trailing non-article divs
		{
			boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"post_content\").*?</div>))");
			content = boost::regex_replace(content,reg14,"");
			reg14.assign("(?i)(<div>.*?</div>)");
			content = boost::regex_replace(content,reg14,"");
		}
		if(0 == sourcepagename.compare("tech163"))  // NetEase Tech: strip trailing non-article divs
		{
			boost::regex reg14("((?i)(<div\\s{1,4}(?!id=\"endtext\").*?</div>))");
			content = boost::regex_replace(content,reg14,"");
		}
		if(0 == sourcepagename.compare("pingwest"))  // PingWest: strip text-only divs
		{
			boost::regex reg15("((?i)(<div.*?>[^<]+</div>))");
			content = boost::regex_replace(content,reg15,"");
		}
		if(0 == sourcepagename.compare("zatu"))  // zatu: strip trailing non-article divs
		{
			boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"format_text entry-content\").*?</div>))");
			content = boost::regex_replace(content,reg14,"");
		}
		if(0 == sourcepagename.compare("jandan"))
		{
			boost::regex reg11("((?i)(<a[^>]*>.*?</a>))");
			content = boost::regex_replace(content,reg11,"");
			boost::regex reg12("((?i)(<span[^>]*>.*?</span>))");
			content = boost::regex_replace(content,reg12,"");
		}
		if(0 == sourcepagename.compare("guaixun"))
		{
			boost::regex reg13("((?i)(<div style=\"position:absolute.*?</div>))");
			content = boost::regex_replace(content,reg13,"");
		}

		content = maincontenttagclean(content);
		strltrim(content);

		// ---- Lazy-loaded images: promote the real URL attribute to src ----
		boost::smatch m;
		boost::regex reg8;
		if(sourcepagename == "sinablogit")
		{
			reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")");
			content = boost::regex_replace(content,reg8,"");
			reg8.assign("(?i)( real_src =)");
			content = boost::regex_replace(content,reg8," src=");
		}
		else if(sourcepagename == "aqee")
		{
			reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")");
			content = boost::regex_replace(content,reg8,"");
			reg8.assign("(?i)( data-original=)");
			content = boost::regex_replace(content,reg8," src=");
		}
		else if(sourcepagename == "macx")
		{
			reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")");
			content = boost::regex_replace(content,reg8,"");
			reg8.assign("(?i)( zoomfile=)");
			content = boost::regex_replace(content,reg8," src=");
		}
		if(0 == sourcepagename.compare("cuntuba"))  // cuntuba: normalise single quotes to double quotes
		{
			reg8.assign("(?i)(\')");
			content = boost::regex_replace(content,reg8,"\"");
		}
		reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")");

		// ---- Collect image URLs and rewrite them to absolute links ----
		// contentimg keeps only the <img ...> tags so the src search below
		// cannot match attributes of other elements.
		contentimg = content;
		boost::regex reg("((?i)<((?!img)[^>]*>))");
		contentimg = boost::regex_replace(contentimg,reg,"");

		std::string::const_iterator start = contentimg.begin();
		std::string::const_iterator end = contentimg.end();
		try
		{
			while(boost::regex_search(start,end,m,reg8))
			{
				if (m[0].matched)
				{
					string tempurl(m[0].first,m[0].second);
					string regurl;
					// Keep only the text between the first and last double quote.
					tempurl = tempurl.substr(tempurl.find_first_of('\"',0)+1,tempurl.find_last_of('\"')-tempurl.find_first_of('\"',0)-1);
					regurl =  _expandlinks(urlstr,tempurl);
					if(0 != tempurl.length())
					{
						// Replace the URL as plain text: URLs routinely contain
						// regex metacharacters ('?', '.', '+') and format
						// metacharacters ('$', '\'), so neither side may be
						// interpreted as a pattern.
						boost::regex urlPattern(tempurl, boost::regex::literal);
						content = boost::regex_replace(content,urlPattern,regurl,
							boost::regex_constants::match_default | boost::regex_constants::format_literal);
						ImgStrList.push_back(regurl);
					}
					start = m[0].second;
				}
			}
		}
		catch (const boost::bad_expression& e)
		{
			theLog.WriteLog(LOG_LEVEL_ERROR,"cann't create regex with %s!",urlstr.c_str());
		}
		list<string>::iterator it;
		for( it = ImgStrList.begin(); it != ImgStrList.end(); it++)
		{
			printf("%s\n",(*it).c_str());
		}

		// ---- Fallback summary: first <p> block, else the first line ----
		if( 0 == Introduction.length() )
		{
			boost::smatch m1;
			boost::regex regParagraph("(?i)(<p>.*?</p>)");
			std::string::const_iterator pstart = content.begin();
			std::string::const_iterator pend = content.end();
			if(boost::regex_search(pstart,pend,m1,regParagraph) && m1[0].matched)
			{
				Introduction = m1[0].str();
			}

			if( 0 == Introduction.length() )
			{
				const std::string::size_type pos = content.find_first_of("\x0d\x0a",0);
				if(pos != std::string::npos)
				{
					Introduction = content.substr(0,pos);
				}
			}
		}
		Introduction = Introductioncontenttagclean(Introduction);
		strltrim(Introduction);
		if( 0 == p_spiderruler[sourcepagename].summaryisinbody.compare("yes") )
		{
			content = Introduction + content;
		}
		printf("publishtime=%s\n",publishtime.c_str());
		printf("Introduction=%s\n",Introduction.c_str());
		printf("content=%s\n",content.c_str());
	}
	Qiniu_Global_Cleanup();                 /* global clean-up; call once at process exit */
	return 0;
}