/*
 * strtrim
 *
 * Applies strrtrim() and then strltrim() to the caller's buffer and hands
 * the very same pointer back.  Whatever the two helpers return is ignored,
 * so they are presumably expected to edit the buffer in place (they are
 * defined elsewhere -- confirm against their definitions).
 */
char *strtrim(char *s)
{
	(void)strrtrim(s);	/* right-hand side first ...      */
	(void)strltrim(s);	/* ... then the left-hand side    */

	return s;
}
/* * ParseDirective * Note: macro directives are handled before recording */ void ParseDirective(const char *cline) { char *line = strdup(strltrim(cline) + 1); // skip '.' and allow strtok //printf("'%s'\n", file->line); //printf("'%s'\n", line); bool valid_directive = false; //bool done_directive = true; //printf("Line: [%s]\n", cline); /* * macro ending directives */ if (DIRECTIVE("endm", PARSE_MACRO_DIRECTIVES)) { current_macro = 0; parse_directives = PARSE_ALL_DIRECTIVES; } else if (DIRECTIVE("endr", PARSE_REPT_DIRECTIVES)) { current_macro = 0; parse_directives = PARSE_ALL_DIRECTIVES; while (repeat-- > 0) MacroExecute("_rept"); } /* * record to macro */ if (current_macro) { MacroLine(cline); // record full line goto exit; // only process macro ending directives } /* * macro starting directives */ if (DIRECTIVE("macro", PARSE_MACRO_DIRECTIVES) || DIRECTIVE("macroicase", PARSE_MACRO_DIRECTIVES)) { char *name = strtok((char *)strskipspace(line), delim_chars); current_macro = FindMacro(name); if (pass != PASS_ASM) { if (current_macro) { eprintf("Macro name already defined.\n"); eexit(); } current_macro = NewMacro(name, DIRECTIVE("macroicase", PARSE_MACRO_DIRECTIVES)); EEKS{printf("new macro at %p\n", current_macro);} char *paramname; while ((paramname = strtok(0, delim_chars))) { if (isspace2(paramname[0])) paramname = strskipspace(paramname); if (strchr(endline_chars, *paramname)) break; current_macro->AddParameter(paramname); } } parse_directives = PARSE_MACRO_DIRECTIVES; }
/**
 * Return the next (non-commented) line from the host-file.
 * Format is:
 *   ip-address host-name [alias..] {\n | # ..}
 *
 * Lines whose first character after strltrim() is '#', ';' or a newline
 * are skipped, as are lines without both an address and a name or whose
 * first token is rejected by isaddr().
 *
 * Returns NULL (with h_errno set to NO_RECOVERY) when netdb_init() fails
 * or no host-file is open, and NULL when fgets() yields no further line.
 * Otherwise returns whatever fill_hostent() produces for the parsed entry.
 *
 * Alias strings are copied into a function-local static table, so they are
 * overwritten by the next call.
 */
struct hostent * W32_CALL gethostent (void)
{
  struct _hostent h;
  char  *tok, *ip, *name, *alias;
  char   buf [2*MAX_HOSTLEN];
  int    i;

  if (!netdb_init() || !hostFile)
  {
    h_errno = NO_RECOVERY;
    return (NULL);
  }

  while (1)
  {
    if (!fgets(buf,sizeof(buf),hostFile))
       return (NULL);

    tok = strltrim (buf);
    if (*tok == '#' || *tok == ';' || *tok == '\n')
       continue;

    ip   = strtok (tok, " \t");
    name = strtok (NULL, " \t\n");
    if (ip && name && isaddr(ip))
       break;
  }

  if (hostClose)
     endhostent();

  memset (&h, 0, sizeof(h));
  if (!strcmp(ip,"0.0.0.0"))   /* inet_addr() maps 0 -> INADDR_NONE */
       h.h_address[0] = INADDR_ANY;
  else h.h_address[0] = inet_addr (ip);

  h.h_num_addr = 1;
  h.h_name     = name;
  alias        = strtok (NULL, " \t\n");

  for (i = 0; alias && i < MAX_HOST_ALIASES; i++)
  {
    /* Sized with the same limit that bounds 'i' above.  The table used to
     * be dimensioned with MAX_NETENT_ALIASES, which only works as long as
     * that constant is not smaller than MAX_HOST_ALIASES.
     */
    static char aliases [MAX_HOST_ALIASES][MAX_HOSTLEN];

    if (*alias == '#' || *alias == ';')   /* rest of the line is a comment */
       break;

    h.h_aliases[i] = StrLcpy (aliases[i], alias, sizeof(aliases[i]));
    alias = strtok (NULL, " \t\n");
  }
  return fill_hostent (&h);
}
/*
 * Return the next (non-commented) line from the network-file
 * Format is:
 *   name [=] net [alias..] {\n | # ..}
 *
 * e.g.
 *   loopback     127
 *   arpanet      10   arpa
 *
 * Lines whose first character after strltrim() is '#', ';' or a newline
 * are skipped, as are lines that lack either the name or the net token.
 *
 * Returns NULL when netdb_init() fails, when no network-file is open, or
 * when fgets() yields no further line.  Otherwise returns whatever
 * fill_netent() produces for the parsed entry.
 *
 * Alias strings are copied into a function-local static table, so they are
 * overwritten by the next call.
 */
struct netent * W32_CALL getnetent (void)
{
  struct _netent n;
  char  *name, *net, *alias;
  char   buf [2*MAX_NAMELEN], *tok;
  int    i;

  /* Same guard as gethostent(): never hand a NULL stream to fgets().
   */
  if (!netdb_init() || !networkFile)
     return (NULL);

  while (1)
  {
    if (!fgets(buf,sizeof(buf),networkFile))
       return (NULL);

    tok = strltrim (buf);
    if (*tok == '#' || *tok == ';' || *tok == '\n')
       continue;

    name = strtok (tok, " \t");
    net  = strtok (NULL, "= \t\n");   /* '=' between name and net is optional */
    if (name && net)
       break;
  }

  if (networkClose)
     endnetent();

  memset (&n, 0, sizeof(n));
  n.n_net  = inet_network (net);
  n.n_name = name;
  alias    = strtok (NULL, " \t\n");

  for (i = 0; alias && i < MAX_NETENT_ALIASES; i++)
  {
    static char aliases [MAX_NETENT_ALIASES][MAX_NAMELEN];

    if (*alias == '#' || *alias == ';')   /* rest of the line is a comment */
       break;

    n.n_aliases[i] = StrLcpy (aliases[i], alias, sizeof(aliases[i]));
    alias = strtok (NULL, " \t\n");
  }
  return fill_netent (&n);
}
/*
 * strtrim -- run strltrim() on 's', then strrtrim() on what strltrim()
 * returned, and pass the final result back to the caller.
 *
 * The returned pointer is whatever the helpers yield; it is not
 * necessarily equal to 's' (depends on strltrim(), defined elsewhere).
 */
char *strtrim(char *s)
{
    char *left_done = strltrim(s);

    return strrtrim(left_done);
}
/*
 * File-local helper: feeds 'str' through strrtrim() and the outcome of
 * that through strltrim(), returning what strltrim() returns.
 */
static char *strtrim(char *str)
{
    char *tail_done = strrtrim(str);

    return strltrim(tail_done);
}
/**
 * Passes @p str through strrtrim() and then strltrim(), both with the
 * same @p trim argument.
 *
 * @param str   buffer handed to strrtrim()
 * @param trim  forwarded unchanged to both helpers (presumably the set of
 *              characters to strip -- confirm against the helpers)
 * @return      the value returned by the final strltrim() call
 */
char *CDateTime::strtrim(char *str, const char *trim)
{
    return strltrim(strrtrim(str, trim), trim);
}
/*
 * strtrim -- two-step trim using the caller-supplied 'trim' argument.
 *
 * strrtrim(str, trim) runs first; its result is then given to
 * strltrim(..., trim), whose return value is handed back.
 */
char *strtrim(char *str, const char *trim)
{
    char *right_done = strrtrim(str, trim);

    return strltrim(right_done, trim);
}
/*
 * CUnit test case for strltrim().
 *
 * For each of the fixtures str_a .. str_d the result of strltrim() must
 * compare equal (strcmp() == 0) to the matching STR_x_EXPECTED_VALUE_LT.
 */
void test_left_side_trim (void)
{
  CU_ASSERT (strcmp (STR_A_EXPECTED_VALUE_LT, strltrim (str_a)) == 0);
  CU_ASSERT (strcmp (STR_B_EXPECTED_VALUE_LT, strltrim (str_b)) == 0);
  CU_ASSERT (strcmp (STR_C_EXPECTED_VALUE_LT, strltrim (str_c)) == 0);
  /* Terminated with ';' like the assertions above, so the statement does
   * not depend on CU_ASSERT happening to expand to a braced block. */
  CU_ASSERT (strcmp (STR_D_EXPECTED_VALUE_LT, strltrim (str_d)) == 0);
}
int main(int argc, char* argv[]) { QINIU_ACCESS_KEY = "sn7d6X2kmRQKkNyO0_ZY_Hz2utVrXIeEmc8QutVC"; QINIU_SECRET_KEY = "jeQSWafTp7kczgR4qVa-erKOaHk0_qcvMNacxO8E"; curl_global_init(CURL_GLOBAL_ALL); Qiniu_Global_Init(-1); /* 全局初始化函数,整个进程只需要调用一次 */ if(strcmp(argv[1],"imgtest") == 0 && argc == 4) { string oldurl,newurl,articleurl; articleurl = argv[2]; oldurl = argv[3]; newurl = _expandlinks(articleurl,oldurl); printf("newurl=%s\n",oldurl.c_str()); } if(strcmp(argv[1],"-start") == 0 && argc == 4) { int ntime = atoi(argv[2]); int nprocesstimes = 1; ArticleManage m_article; theLog.SetLogFilePath(GetFullPath()); if(!ReadConfigFile((GetFullPath()+"/sqlconfig.conf").c_str(),p_config)) { return 0; } if(!ReadSpiderRulerConfigFile((GetFullPath()+"/spiderruler.conf").c_str(),p_spiderruler)) { return 0; } while(1) { m_article.start(atoi(argv[3])); theLog.WriteLog(LOG_LEVEL_SYS,"the proc ArticleExtract %d times completed!",nprocesstimes); nprocesstimes++; sleep(ntime * 60); } } if(argc == 3 && strcmp(argv[1],"-start") != 0 ) { if(!ReadSpiderRulerConfigFile((GetFullPath()+"/spiderruler.conf").c_str(),p_spiderruler)) { return 0; } string page; list<string> ImgStrList; page.clear(); const char* url = argv[1]; string sourcepagename = argv[2]; //const char* url = "http://admin.wechat.com/mp/appmsg/show?__biz=MjM5MTIwODcxNA==&appmsgid=10001872&itemidx=1&sign=d5997fecd12a3af79f8c8d65600f82a1"; printf("url=%s\n",url); string urlstr = url; if(0 == sourcepagename.compare("zatu")) { string strtmp = ""; iconv_string("utf-8","gbk", urlstr.c_str(), urlstr.length(),strtmp,1); urlstr = strtmp; printf("urlstr=%s\n",urlstr.c_str()); } int method = 0; if(!getPage(urlstr.c_str(), method,page)) { printf("不能获取URL内容\n"); return 0; } string content; string Introduction; string publishtime; string titlestr; string contentimg; page = mainpagetagclean(page); if(page.length() < 2048) { printf("the page source length too short ! 
\n"); return 0; } printf("page length=%d\n",page.length()); HtmlExtract sorceExtract(page,p_spiderruler[sourcepagename]); sorceExtract.Extract(); titlestr = sorceExtract.GetTitle(); boost::regex title_reg("((?i)( ))"); titlestr = boost::regex_replace(titlestr,title_reg,""); printf("title=%s\n",titlestr.c_str()); content = sorceExtract.ArticleContent; publishtime = sorceExtract.GetDateTime(); Introduction = sorceExtract.Introduction; strltrim(publishtime); publishtime = publish_time_deal(publishtime); if(0 == sourcepagename.compare("geekpark")) //极客公园特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(id=\"tags\").*?</div>))"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("cuntuba")) //苹果网特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"cont\").*?</div>))"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("macx")) //苹果网特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"v2-t_fsz\").*?</div>))"); content = boost::regex_replace(content,reg14,""); reg14.assign("(?i)(<a[^>]*>.*?</a>)"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("leiphone")) //雷锋网尾部特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"post_content\").*?</div>))"); content = boost::regex_replace(content,reg14,""); reg14.assign("(?i)(<div>.*?</div>)"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("tech163")) //网易科技尾部特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!id=\"endtext\").*?</div>))"); content = boost::regex_replace(content,reg14,""); } if(0 == 
sourcepagename.compare("pingwest")) //pingwest尾部特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg15("((?i)(<div.*?>[^<]+</div>))"); content = boost::regex_replace(content,reg15,""); } if(0 == sourcepagename.compare("zatu")) //杂图天下尾部特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"format_text entry-content\").*?</div>))"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("jandan")) { boost::regex reg11("((?i)(<a[^>]*>.*?</a>))"); content = boost::regex_replace(content,reg11,""); boost::regex reg12("((?i)(<span[^>]*>.*?</span>))"); content = boost::regex_replace(content,reg12,""); } if(0 == sourcepagename.compare("guaixun")) { boost::regex reg13("((?i)(<div style=\"position:absolute.*?</div>))"); content = boost::regex_replace(content,reg13,""); } content = maincontenttagclean(content); strltrim(content); boost::smatch m; boost::regex reg8; if(sourcepagename == "sinablogit") { reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")"); content = boost::regex_replace(content,reg8,""); reg8.assign("(?i)( real_src =)"); content = boost::regex_replace(content,reg8," src="); } else if(sourcepagename == "aqee") { reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")"); content = boost::regex_replace(content,reg8,""); reg8.assign("(?i)( data-original=)"); content = boost::regex_replace(content,reg8," src="); } else if(sourcepagename == "macx") { reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")"); content = boost::regex_replace(content,reg8,""); reg8.assign("(?i)( zoomfile=)"); content = boost::regex_replace(content,reg8," src="); } if(0 == sourcepagename.compare("cuntuba")) //寸土吧特殊处理 { reg8.assign("(?i)(\')"); content = boost::regex_replace(content,reg8,"\""); } reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")"); contentimg = content; boost::regex reg("((?i)<((?!img)[^>]*>))"); contentimg = 
boost::regex_replace(contentimg,reg,""); //boost::regex reg8("(?i)(src=\"([^\"]*)\")"); std::string::const_iterator start = contentimg.begin(); std::string::const_iterator end = contentimg.end(); try { while(boost::regex_search(start,end,m,reg8)) { if (m[0].matched) { string tempurl(m[0].first,m[0].second); string regurl; tempurl = tempurl.substr(tempurl.find_first_of('\"',0)+1,tempurl.find_last_of('\"')-tempurl.find_first_of('\"',0)-1); //if( 0 == sourcepagename.compare("zatu") ) // { regurl = _expandlinks(urlstr,tempurl); // } if(0 != tempurl.length()) { boost::regex reg(tempurl); content = boost::regex_replace(content,reg,regurl); ImgStrList.push_back(regurl); } start = m[0].second; } } } catch (const boost::bad_expression& e) { theLog.WriteLog(LOG_LEVEL_ERROR,"cann't create regex with %s!",urlstr.c_str()); } list<string>::iterator it; for( it = ImgStrList.begin(); it != ImgStrList.end(); it++) { printf("%s\n",(*it).c_str()); } if( 0 == Introduction.length() ) { boost::smatch m1; boost::regex reg("(?i)(<p>.*?</p>)"); std::string::const_iterator start = content.begin(); std::string::const_iterator end = content.end(); while(boost::regex_search(start,end,m1,reg)) { if (m1[0].matched) { Introduction = m1[0].str(); } break; } int pos; if( 0 == Introduction.length() ) { if(-1 != (pos = content.find_first_of("\x0d\x0a",0))) { Introduction = content.substr(0,pos); } } } Introduction = Introductioncontenttagclean(Introduction); strltrim(Introduction); if( 0 == p_spiderruler[sourcepagename].summaryisinbody.compare("yes") ) { content = Introduction + content; } printf("publishtime=%s\n",publishtime.c_str()); printf("Introduction=%s\n",Introduction.c_str()); printf("content=%s\n",content.c_str()); } Qiniu_Global_Cleanup(); /* 全局清理函数,只需要在进程退出时调用一次 */ return 0; }
char line_parse (void) { /*---(locals)-----------+-----------+-*/ char rce = -10; int rc = 0; int x_fields = 0; char x_recd [LEN_RECD]; int x_pos = 0; /*---(header)-------------------------*/ DEBUG_INPT yLOG_enter (__FUNCTION__); /*---(prepare)------------------------*/ my.t_ready = '-'; strlcpy (my.t_tracker , "n/a" , LEN_TRACKER); strlcpy (my.t_schedule, "" , LEN_RECD); strlcpy (my.t_flags , "- - - - -", LEN_FLAGS); strlcpy (my.t_command , "" , LEN_COMMAND); /*---(field count)--------------------*/ rc = yPARSE_ready (&x_fields); DEBUG_INPT yLOG_value ("ready" , rc); --rce; if (rc != 'y') { DEBUG_INPT yLOG_exit (__FUNCTION__); return rce; } DEBUG_INPT yLOG_value ("fields" , x_fields); --rce; if (x_fields < 1) { DEBUG_INPT yLOG_exit (__FUNCTION__); return rce; } /*---(adjust original record)---------*/ if (x_fields == 1) { DEBUG_INPT yLOG_note ("found original style record (six space delimited parts)"); rc = yPARSE_popstr (x_recd); DEBUG_INPT yLOG_value ("original" , rc); --rce; if (rc < 0) { DEBUG_INPT yLOG_exitr (__FUNCTION__, rce); return rce; } strltrim (x_recd, ySTR_SINGLE, LEN_RECD); x_fields = strldcnt (x_recd, ' ', LEN_RECD); DEBUG_INPT yLOG_value ("x_fields" , x_fields); if (x_fields < 5) { DEBUG_INPT yLOG_exitr (__FUNCTION__, rce); return rce; } x_pos = strldpos (x_recd, ' ', 5, LEN_RECD); DEBUG_INPT yLOG_value ("x_pos" , x_pos); x_recd [x_pos] = '\0'; strlcpy (my.t_schedule, x_recd, LEN_RECD); DEBUG_INPT yLOG_info ("t_schedule", my.t_schedule); strlcpy (my.t_command , x_recd + x_pos + 1, LEN_COMMAND); DEBUG_INPT yLOG_info ("t_command" , my.t_command); my.t_ready = 'y'; /*---(complete)-----------------------*/ DEBUG_INPT yLOG_exit (__FUNCTION__); return 0; } /*---(schedule string)----------------*/ rc = yPARSE_popstr (my.t_schedule); strltrim (my.t_schedule, ySTR_SINGLE, LEN_RECD); DEBUG_INPT yLOG_value ("schedule" , rc); --rce; if (rc < 0) { DEBUG_INPT yLOG_exitr (__FUNCTION__, rce); return rce; } DEBUG_INPT yLOG_info ("t_schedule", my.t_schedule); 
/*---(tracker/title)------------------*/ if (x_fields > 2) { rc = yPARSE_popstr (my.t_tracker); DEBUG_INPT yLOG_value ("tracker" , rc); --rce; if (rc < 0) { DEBUG_INPT yLOG_exitr (__FUNCTION__, rce); return rce; } DEBUG_INPT yLOG_info ("tracker" , my.t_tracker); } /*---(run-time flags)-----------------*/ if (x_fields > 3) { rc = yPARSE_popstr (my.t_flags); DEBUG_INPT yLOG_value ("flags" , rc); --rce; if (rc < 0) { DEBUG_INPT yLOG_exitr (__FUNCTION__, rce); return rce; } DEBUG_INPT yLOG_info ("t_flags" , my.t_flags); } /*---(command)------------------------*/ rc = yPARSE_popstr (my.t_command); DEBUG_INPT yLOG_value ("command" , rc); --rce; if (rc < 0) { DEBUG_INPT yLOG_exitr (__FUNCTION__, rce); return rce; } DEBUG_INPT yLOG_info ("command" , my.t_command); /*---(set ready)----------------------*/ my.t_ready = 'y'; /*---(complete)-----------------------*/ DEBUG_INPT yLOG_exit (__FUNCTION__); return 0; }