//编码问题,如果是gbk,则转成utf8 string Doc::charset_convert() { string pattern = "charset\\s?=\\s?(.*?)\""; Regex *regex = new Regex(pattern); string charset = regex->match_one(content, 1); if (charset == "utf-8" || charset == "utf8") //如果文档是utf8编码,则不做特别处理 return content; else if (charset == "gbk" || charset == "gb2312") //如果是gbk编码,则转换成utf8编码 { Convert *con = new Convert("gbk", "utf8"); return con->exec(content); } else //其他编码不予以考虑,直接跳过 return ""; }