/*
 * Feeds the markup held in `x` to the tidy document `tdoc`, runs the
 * clean-and-repair and diagnostics passes on it, and saves the result to the
 * file named by `path`.
 *
 * `tdoc` is not declared in this function; it is presumably a member of
 * tidyhtml (confirm against the class declaration).
 * The return codes of the tidy calls are not inspected.
 */
void tidyhtml::parse(std::string x, std::string path)
{
    const char *markup = x.c_str();
    const char *destination = path.c_str();

    // The error buffer is set with a null pointer, i.e. no buffer is supplied.
    tidySetErrorBuffer(tdoc, NULL);

    // Parse -> clean/repair -> diagnostics, always in this order.
    tidyParseString(tdoc, markup);
    tidyCleanAndRepair(tdoc);
    tidyRunDiagnostics(tdoc);

    // Write the processed document out.
    tidySaveFile(tdoc, destination);
}
int main(int argc, char *argv[]) { errno = 0; //CL args const char *dbfile = NULL, *stn_departure = NULL, *stn_arrival = NULL; int ch,res, consecutive_success = 0; //Handles CURL *curl_hdl = NULL; sqlite3 *db_hdl = NULL; TidyDoc tdoc = NULL; //Query initialisers char *link, *new_link; struct tm tm_dep; time_t last_time_dep = 0; int requery = 0; //Parse result holders struct train_list_t *trains = NULL; size_t n, ntrains; //Output char str_time_dep[20]; size_t total = 0; //Parse cmdline while( (ch = getopt(argc, argv, "d:f:t:")) != -1 ) { debug("ch = %d", ch); switch(ch) { case 'd': debug("d %s", optarg); dbfile = optarg; break; case 'f': debug("f %s", optarg); stn_departure = optarg; break; case 't': debug("t %s", optarg); stn_arrival = optarg; break; case '?': if(optopt=='d' || optopt=='f' || optopt=='t') { log_info("Missing argument for option -%c", optopt); goto usage; } else if(isprint(optopt)) { log_info("Unknown option '-%c'", optopt); } else { log_info("Unknown option character '\\x%x'", optopt); } break; default: debug("err got c=%d (opterr=%d, optopt=%c, optind=%d, optarg=%s)", ch, opterr, optopt, optind, optarg); goto usage; } } if(!dbfile || !stn_departure || !stn_arrival) goto usage; debug("Starting %s with dbfile='%s', stn_dep='%s', stn_arr='%s'", argv[0], dbfile, stn_departure, stn_arrival); //Set up Curl check(curl_tidy_init(&curl_hdl)==0,"Failed to initialise curl"); debug("curl_hdl %p", curl_hdl); //Set up database and get names res = database_init(&db_hdl, dbfile); check(res==0, "Failed to open database"); //Send search query time_t now = time(NULL); localtime_r(&now, &tm_dep); tm_dep.tm_hour++; res = sncf_post_form(curl_hdl, &tdoc, &link, &tm_dep, stn_departure, stn_arrival); check(res==0, "Failed to perform query"); debug("Initialized (%d) - link = %s", res, link); //Fetch, parse, print while(1) { debug("Next link %s", link); tidyRelease(tdoc); res = curl_tidy_get(curl_hdl, link, &tdoc); check(res == 0, "failed to fetch results page"); res = 
sncf_find_next_results_link(tdoc, &new_link); check(res == 0, "failed to get link to next results"); /* * An error in the SNCF site results in occasionally being * sent to the same results page. This means getting stuck * in a loop. If that happens, a workaround is to start a * new query and continue from there */ if(!strcmp(link, new_link)) { log_info("Next results page is the same as the current one (%d successes)", consecutive_success); if(consecutive_success <= 2) { log_info("less than 3 success before loop, this is the end"); break; } requery = 1; } if(requery) { if(requery == 1) log_info("requerying cos of link loop"); if(requery == 2) log_info("requerying cos of time travel"); requery = 0; localtime_r(&last_time_dep, &tm_dep); consecutive_success = 0; tidyRelease(tdoc); free(link); free(new_link); //FIXME: change tm_dep so the SNCF site is likely to handle it res = sncf_post_form(curl_hdl, &tdoc, &link, &tm_dep, stn_departure, stn_arrival); check(res==0, "Failed to perform query"); continue; } if(trains) { debug("last time dep = %lu - train time dep = %lu", last_time_dep, get_last_train(trains)->train.time_departure); } free_trains(trains); trains = NULL; ntrains = sncf_parse_results(db_hdl, tdoc, &trains); debug("found %lu trains", ntrains); //Check if we're getting the same results over and over again (only iff we have results (ntrains) // and only if last_time_dep was set before (check if it's not 0 as initialized)) if(last_time_dep && ntrains && get_last_train(trains)->train.time_departure < last_time_dep) { requery = 2; continue; } if(last_time_dep && ntrains && get_last_train(trains)->train.time_departure == last_time_dep) { log_info("Got the exact same results twice, finishing up"); break; } if(ntrains) { last_time_dep = get_last_train(trains)->train.time_departure; } else { log_info("No trains found, this is the end"); break; } n = train_store(db_hdl, trains); if(n!=ntrains) { log_info("only stored %lu out of %lu trains, aborting", n, ntrains); goto 
error; } debug("Stored all %lu trains", n); total+=n; #ifdef NDEBUG localtime_r(&get_last_train(trains)->train.time_departure, &tm_dep); strftime(str_time_dep, 20, "%e-%b-%Y %R", &tm_dep); printf("Processed %6lu trains - Last one departed at %s\r", total, str_time_dep); fflush(stdout); #else print_trains(db_hdl, trains, 0); #endif consecutive_success++; free(link); link = new_link; } free(link); free(new_link); error: if(tdoc) { tidySaveFile(tdoc, "dumpfile-exit.html"); tidyRelease(tdoc); } curl_tidy_cleanup(curl_hdl); database_cleanup(db_hdl); localtime_r(&get_last_train(trains)->train.time_departure, &tm_dep); strftime(str_time_dep, 20, "%e-%b-%Y %R", &tm_dep); log_info("Exiting after storing %lu trains (last one arriving %s)", total, str_time_dep); return 0; usage: printf( "Usage : %s -d <dbfile> -f <stn_dep> -t <stn_arr>\n" "\n" "\t<dbfile>\tThe sqlite3 database filename\n" "\t<stn_dep>\tThe departure station\n" "\t<stn_arr>\tThe arrival station\n" "\n", argv[0]); return 0; }
/*
 * Command-line driver for HTML Tidy.
 *
 * Loads configuration (compile-time config file, then either the file named
 * by the HTML_TIDY environment variable or the per-user config file), then
 * walks the command line. Options (arguments starting with '-') update the
 * settings of the single TidyDoc; every other argument is treated as an input
 * file that is parsed, cleaned, diagnosed and written out. With no file
 * argument the document is read from stdin.
 *
 * Exit status: 2 if any content errors were counted, 1 if only warnings were
 * counted, 0 otherwise. The informational options (help, version, config
 * listings) release the document and return 0 immediately.
 */
int main( int argc, char** argv )
{
    ctmbstr prog = argv[0];
    ctmbstr cfgfil = NULL, errfil = NULL, htmlfil = NULL;
    TidyDoc tdoc = tidyCreate();
    int status = 0;

    /* Totals accumulated over all processed inputs. accessWarnings is
     * accumulated but not read again in this function. */
    uint contentErrors = 0;
    uint contentWarnings = 0;
    uint accessWarnings = 0;

    /* errout is not declared in this function (declared elsewhere in the file) */
    errout = stderr;  /* initialize to stderr */
    status = 0;

#ifdef TIDY_CONFIG_FILE
    /* Compile-time configured config file, loaded only if it exists */
    if ( tidyFileExists( tdoc, TIDY_CONFIG_FILE) )
    {
        status = tidyLoadConfig( tdoc, TIDY_CONFIG_FILE );
        if ( status != 0 )
            fprintf(errout, "Loading config file \"%s\" failed, err = %d\n", TIDY_CONFIG_FILE, status);
    }
#endif /* TIDY_CONFIG_FILE */

    /* look for env var "HTML_TIDY" */
    /* then for ~/.tidyrc (on platforms defining $HOME) */

    if ( (cfgfil = getenv("HTML_TIDY")) != NULL )
    {
        status = tidyLoadConfig( tdoc, cfgfil );
        if ( status != 0 )
            fprintf(errout, "Loading config file \"%s\" failed, err = %d\n", cfgfil, status);
    }
#ifdef TIDY_USER_CONFIG_FILE
    /* Only consulted when HTML_TIDY is not set (this is the else branch) */
    else if ( tidyFileExists( tdoc, TIDY_USER_CONFIG_FILE) )
    {
        status = tidyLoadConfig( tdoc, TIDY_USER_CONFIG_FILE );
        if ( status != 0 )
            fprintf(errout, "Loading config file \"%s\" failed, err = %d\n", TIDY_USER_CONFIG_FILE, status);
    }
#endif /* TIDY_USER_CONFIG_FILE */

    /* read command line */
    /* argv[1] is always the argument under inspection: each consumed
     * argument is dropped by --argc / ++argv. */
    while ( argc > 0 )
    {
        if (argc > 1 && argv[1][0] == '-')
        {
            /* support -foo and --foo */
            /* arg skips only the first '-', so "--foo" is seen as "-foo";
             * that is why some comparisons below use names with a leading '-'. */
            ctmbstr arg = argv[1] + 1;

            if ( strcasecmp(arg, "xml") == 0)
                tidyOptSetBool( tdoc, TidyXmlTags, yes );

            else if ( strcasecmp(arg, "asxml") == 0 ||
                      strcasecmp(arg, "asxhtml") == 0 )
            {
                tidyOptSetBool( tdoc, TidyXhtmlOut, yes );
            }
            else if ( strcasecmp(arg, "ashtml") == 0 )
                tidyOptSetBool( tdoc, TidyHtmlOut, yes );

            else if ( strcasecmp(arg, "indent") == 0 )
            {
                tidyOptSetInt( tdoc, TidyIndentContent, TidyAutoState );
                /* An indent width of 0 is put back to its default value */
                if ( tidyOptGetInt(tdoc, TidyIndentSpaces) == 0 )
                    tidyOptResetToDefault( tdoc, TidyIndentSpaces );
            }
            else if ( strcasecmp(arg, "omit") == 0 )
                tidyOptSetBool( tdoc, TidyHideEndTags, yes );

            else if ( strcasecmp(arg, "upper") == 0 )
                tidyOptSetBool( tdoc, TidyUpperCaseTags, yes );

            else if ( strcasecmp(arg, "clean") == 0 )
                tidyOptSetBool( tdoc, TidyMakeClean, yes );

            else if ( strcasecmp(arg, "bare") == 0 )
                tidyOptSetBool( tdoc, TidyMakeBare, yes );

            /* Character-encoding names: the option name itself is passed on
             * as the encoding. Some names are only compiled in when the
             * matching feature macro is enabled. */
            else if ( strcasecmp(arg, "raw") == 0      ||
                      strcasecmp(arg, "ascii") == 0    ||
                      strcasecmp(arg, "latin0") == 0   ||
                      strcasecmp(arg, "latin1") == 0   ||
                      strcasecmp(arg, "utf8") == 0     ||
#ifndef NO_NATIVE_ISO2022_SUPPORT
                      strcasecmp(arg, "iso2022") == 0  ||
#endif
#if SUPPORT_UTF16_ENCODINGS
                      strcasecmp(arg, "utf16le") == 0  ||
                      strcasecmp(arg, "utf16be") == 0  ||
                      strcasecmp(arg, "utf16") == 0    ||
#endif
#if SUPPORT_ASIAN_ENCODINGS
                      strcasecmp(arg, "shiftjis") == 0 ||
                      strcasecmp(arg, "big5") == 0     ||
#endif
                      strcasecmp(arg, "mac") == 0      ||
                      strcasecmp(arg, "win1252") == 0  ||
                      strcasecmp(arg, "ibm858") == 0 )
            {
                tidySetCharEncoding( tdoc, arg );
            }
            else if ( strcasecmp(arg, "numeric") == 0 )
                tidyOptSetBool( tdoc, TidyNumEntities, yes );

            else if ( strcasecmp(arg, "modify") == 0 ||
                      strcasecmp(arg, "change") == 0 ||  /* obsolete */
                      strcasecmp(arg, "update") == 0 )   /* obsolete */
            {
                tidyOptSetBool( tdoc, TidyWriteBack, yes );
            }
            else if ( strcasecmp(arg, "errors") == 0 )
                tidyOptSetBool( tdoc, TidyShowMarkup, no );

            else if ( strcasecmp(arg, "quiet") == 0 )
                tidyOptSetBool( tdoc, TidyQuiet, yes );

            /* Informational options: print, release the document, exit 0 */
            else if ( strcasecmp(arg, "help") == 0 ||
                      strcasecmp(arg,    "h") == 0 || *arg == '?' )
            {
                help( prog );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "xml-help") == 0)
            {
                xml_help( );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "help-config") == 0 )
            {
                optionhelp( tdoc );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "xml-config") == 0 )
            {
                XMLoptionhelp( tdoc );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "show-config") == 0 )
            {
                optionvalues( tdoc );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            /* Options taking a value: the value is argv[2] and is consumed
             * with an extra --argc / ++argv. Without a value (argc < 3) the
             * option is silently ignored. */
            else if ( strcasecmp(arg, "config") == 0 )
            {
                if ( argc >= 3 )
                {
                    ctmbstr post;

                    /* The result of loading this config file is not checked */
                    tidyLoadConfig( tdoc, argv[2] );

                    /* Set new error output stream if setting changed */
                    post = tidyOptGetValue( tdoc, TidyErrFile );
                    if ( post && (!errfil || !samefile(errfil, post)) )
                    {
                        errfil = post;
                        errout = tidySetErrorFile( tdoc, post );
                    }

                    --argc;
                    ++argv;
                }
            }

#if SUPPORT_ASIAN_ENCODINGS
            else if ( strcasecmp(arg, "language") == 0 ||
                      strcasecmp(arg,     "lang") == 0 )
            {
                if ( argc >= 3 )
                {
                    tidyOptSetValue( tdoc, TidyLanguage, argv[2] );
                    --argc;
                    ++argv;
                }
            }
#endif

            else if ( strcasecmp(arg, "output") == 0 ||
                      strcasecmp(arg, "-output-file") == 0 ||
                      strcasecmp(arg, "o") == 0 )
            {
                if ( argc >= 3 )
                {
                    tidyOptSetValue( tdoc, TidyOutFile, argv[2] );
                    --argc;
                    ++argv;
                }
            }
            else if ( strcasecmp(arg,  "file") == 0 ||
                      strcasecmp(arg, "-file") == 0 ||
                      strcasecmp(arg,     "f") == 0 )
            {
                if ( argc >= 3 )
                {
                    errfil = argv[2];
                    errout = tidySetErrorFile( tdoc, errfil );
                    --argc;
                    ++argv;
                }
            }
            else if ( strcasecmp(arg,  "wrap") == 0 ||
                      strcasecmp(arg, "-wrap") == 0 ||
                      strcasecmp(arg,     "w") == 0 )
            {
                if ( argc >= 3 )
                {
                    /* The wrap length is set even when no number could be
                     * read (it is then 0); the next argument is consumed
                     * only if a number was actually parsed from it. */
                    uint wraplen = 0;
                    int nfields = sscanf( argv[2], "%u", &wraplen );
                    tidyOptSetInt( tdoc, TidyWrapLen, wraplen );
                    if (nfields > 0)
                    {
                        --argc;
                        ++argv;
                    }
                }
            }
            else if ( strcasecmp(arg,  "version") == 0 ||
                      strcasecmp(arg, "-version") == 0 ||
                      strcasecmp(arg,        "v") == 0 )
            {
                version();
                tidyRelease( tdoc );
                return 0;  /* success */

            }
            /* Any other "--name value" pair is handed to the generic option
             * parser; the value is consumed only when parsing succeeds. */
            else if ( strncmp(argv[1], "--", 2 ) == 0)
            {
                /* NOTE(review): argv[2] is passed without checking
                 * argc >= 3 - confirm tidyOptParseValue tolerates a
                 * missing value when the option is the last argument. */
                if ( tidyOptParseValue(tdoc, argv[1]+2, argv[2]) )
                {
                    /* Set new error output stream if setting changed */
                    ctmbstr post = tidyOptGetValue( tdoc, TidyErrFile );
                    if ( post && (!errfil || !samefile(errfil, post)) )
                    {
                        errfil = post;
                        errout = tidySetErrorFile( tdoc, post );
                    }

                    ++argv;
                    --argc;
                }
            }

#if SUPPORT_ACCESSIBILITY_CHECKS
            else if ( strcasecmp(arg, "access") == 0 )
            {
                if ( argc >= 3 )
                {
                    /* Same pattern as -wrap: level set regardless, argument
                     * consumed only if it parsed as a number. */
                    uint acclvl = 0;
                    int nfields = sscanf( argv[2], "%u", &acclvl );
                    tidyOptSetInt( tdoc, TidyAccessibilityCheckLevel, acclvl );
                    if (nfields > 0)
                    {
                        --argc;
                        ++argv;
                    }
                }
            }
#endif

            else
            {
                /* Anything else is treated as a cluster of single-letter
                 * flags, e.g. "-icq"; each letter is handled in turn. */
                uint c;
                ctmbstr s = argv[1];

                while ( (c = *++s) != '\0' )
                {
                    switch ( c )
                    {
                    case 'i':
                        tidyOptSetInt( tdoc, TidyIndentContent, TidyAutoState );
                        if ( tidyOptGetInt(tdoc, TidyIndentSpaces) == 0 )
                            tidyOptResetToDefault( tdoc, TidyIndentSpaces );
                        break;

                    /* Usurp -o for output file.  Anyone hiding end tags?
                    case 'o':
                        tidyOptSetBool( tdoc, TidyHideEndTags, yes );
                        break;
                    */

                    case 'u':
                        tidyOptSetBool( tdoc, TidyUpperCaseTags, yes );
                        break;

                    case 'c':
                        tidyOptSetBool( tdoc, TidyMakeClean, yes );
                        break;

                    case 'b':
                        tidyOptSetBool( tdoc, TidyMakeBare, yes );
                        break;

                    case 'n':
                        tidyOptSetBool( tdoc, TidyNumEntities, yes );
                        break;

                    case 'm':
                        tidyOptSetBool( tdoc, TidyWriteBack, yes );
                        break;

                    case 'e':
                        tidyOptSetBool( tdoc, TidyShowMarkup, no );
                        break;

                    case 'q':
                        tidyOptSetBool( tdoc, TidyQuiet, yes );
                        break;

                    default:
                        unknownOption( c );
                        break;
                    }
                }
            }

            /* Drop the option itself and look at the next argument */
            --argc;
            ++argv;
            continue;
        }

        /* Not an option: argv[1], if present, is an input file; with no
         * remaining argument the document is read from stdin. */
        if ( argc > 1 )
        {
            htmlfil = argv[1];
            if ( tidyOptGetBool(tdoc, TidyEmacs) )
                tidyOptSetValue( tdoc, TidyEmacsFile, htmlfil );
            status = tidyParseFile( tdoc, htmlfil );
        }
        else
        {
            htmlfil = "stdin";
            status = tidyParseStdin( tdoc );
        }

        /* Each stage runs only if the previous one did not return a
         * negative status. */
        if ( status >= 0 )
            status = tidyCleanAndRepair( tdoc );

        if ( status >= 0 )
            status = tidyRunDiagnostics( tdoc );

        if ( status > 1 ) /* If errors, do we want to force output? */
            status = ( tidyOptGetBool(tdoc, TidyForceOutput) ? status : -1 );

        /* Output goes, in order of preference, back to the input file
         * (write-back with a real file), to the configured output file, or
         * to stdout. Nothing is written when markup output is disabled. */
        if ( status >= 0 && tidyOptGetBool(tdoc, TidyShowMarkup) )
        {
            if ( tidyOptGetBool(tdoc, TidyWriteBack) && argc > 1 )
                status = tidySaveFile( tdoc, htmlfil );
            else
            {
                ctmbstr outfil = tidyOptGetValue( tdoc, TidyOutFile );
                if ( outfil )
                    status = tidySaveFile( tdoc, outfil );
                else
                    status = tidySaveStdout( tdoc );
            }
        }

        contentErrors   += tidyErrorCount( tdoc );
        contentWarnings += tidyWarningCount( tdoc );
        accessWarnings  += tidyAccessWarningCount( tdoc );

        /* Move past the file just processed; stop when nothing is left */
        --argc;
        ++argv;

        if ( argc <= 1 )
            break;
    }

    /* Blank line on stderr only when not quiet, errors still go to stderr,
     * and no content errors were counted. */
    if (!tidyOptGetBool(tdoc, TidyQuiet) &&
        errout == stderr && !contentErrors)
        fprintf(errout, "\n");

    if (contentErrors + contentWarnings > 0 &&
         !tidyOptGetBool(tdoc, TidyQuiet))
        tidyErrorSummary(tdoc);

    if (!tidyOptGetBool(tdoc, TidyQuiet))
        tidyGeneralInfo(tdoc);

    /* called to free hash tables etc. */
    tidyRelease( tdoc );

    /* return status can be used by scripts */
    if ( contentErrors > 0 )
        return 2;

    if ( contentWarnings > 0 )
        return 1;

    /* 0 signifies all is ok */
    return 0;
}