示例#1
0
void tidyhtml::parse(std::string x, std::string path)
{
        tidySetErrorBuffer(tdoc, NULL);
        tidyParseString(tdoc, x.c_str());
        tidyCleanAndRepair( tdoc );
        tidyRunDiagnostics( tdoc );
        tidySaveFile(tdoc, path.c_str());


}
示例#2
0
int main(int argc, char *argv[])
{
	errno = 0;
	//CL args
	const char *dbfile = NULL, *stn_departure = NULL, *stn_arrival = NULL;
	int ch,res, consecutive_success = 0;

	//Handles
	CURL *curl_hdl = NULL;
	sqlite3 *db_hdl = NULL;
 	TidyDoc tdoc = NULL;

	//Query initialisers
	char *link, *new_link;
	struct tm tm_dep;
	time_t last_time_dep = 0;
	int requery = 0;

	//Parse result holders
	struct train_list_t *trains = NULL;
	size_t n, ntrains;

	//Output
	char str_time_dep[20];
	size_t total = 0;

	//Parse cmdline
	while( (ch = getopt(argc, argv, "d:f:t:")) != -1 ) {
		debug("ch = %d", ch);
		switch(ch) {
		case 'd':
			debug("d %s", optarg);
			dbfile = optarg;
			break;
		case 'f':
			debug("f %s", optarg);
			stn_departure = optarg;
			break;
		case 't':
			debug("t %s", optarg);
			stn_arrival = optarg;
			break;
		case '?':
			if(optopt=='d' || optopt=='f' || optopt=='t') {
				log_info("Missing argument for option -%c", optopt);
				goto usage;
			} else if(isprint(optopt)) {
				log_info("Unknown option '-%c'", optopt);
			} else {
				log_info("Unknown option character '\\x%x'", optopt);
			}
			break;	
		default:
			debug("err got c=%d (opterr=%d, optopt=%c, optind=%d, optarg=%s)", ch, opterr, optopt, optind, optarg);
			goto usage;	
		}
	}
	if(!dbfile || !stn_departure || !stn_arrival) goto usage;
	debug("Starting %s with dbfile='%s', stn_dep='%s', stn_arr='%s'",
		argv[0], dbfile, stn_departure, stn_arrival);

	//Set up Curl 
	check(curl_tidy_init(&curl_hdl)==0,"Failed to initialise curl");
	debug("curl_hdl %p", curl_hdl);

	//Set up database and get names
	res = database_init(&db_hdl, dbfile);
	check(res==0, "Failed to open database"); 

	//Send search query 
	time_t now = time(NULL);
	localtime_r(&now, &tm_dep);
	tm_dep.tm_hour++;
	res = sncf_post_form(curl_hdl, &tdoc, 
		&link, &tm_dep, 
		stn_departure, stn_arrival);
	check(res==0, "Failed to perform query");
	debug("Initialized (%d) - link = %s", res, link);

	//Fetch, parse, print
	while(1) {	
		debug("Next link %s", link);
		tidyRelease(tdoc);
		res = curl_tidy_get(curl_hdl, link, &tdoc);
		check(res ==  0, "failed to fetch results page");

		res = sncf_find_next_results_link(tdoc, &new_link);
		check(res == 0, "failed to get link to next results");

/*
 * An error in the SNCF site results in occasionally being
 * sent to the same results page. This means getting stuck
 * in a loop. If that happens, a workaround is to start a
 * new query and continue from there
 */
		if(!strcmp(link, new_link)) {
			log_info("Next results page is the same as the current one (%d successes)", consecutive_success);
			if(consecutive_success <= 2) {
				log_info("less than 3 success before loop, this is the end");
				break;
			}
			requery = 1;
		}
		if(requery) {
			if(requery == 1) log_info("requerying cos of link loop");
			if(requery == 2) log_info("requerying cos of time travel");
			requery = 0;
			localtime_r(&last_time_dep, &tm_dep);

			consecutive_success = 0;
			tidyRelease(tdoc);

			free(link);
			free(new_link);
			//FIXME: change tm_dep so the SNCF site is likely to handle it
			res = sncf_post_form(curl_hdl, &tdoc, 
				&link, &tm_dep, 
				stn_departure, stn_arrival);
			check(res==0, "Failed to perform query");
			continue;
		}

		if(trains) {
			debug("last time dep = %lu - train time dep = %lu", last_time_dep, get_last_train(trains)->train.time_departure);
		}

		free_trains(trains);
		trains = NULL;
		ntrains = sncf_parse_results(db_hdl, tdoc, &trains);
		debug("found %lu trains", ntrains);	

		//Check if we're getting the same results over and over again (only iff we have results (ntrains)
		// and only if last_time_dep was set before (check if it's not 0 as initialized))
		if(last_time_dep && ntrains && get_last_train(trains)->train.time_departure < last_time_dep) {
			requery = 2;
			continue;	
		}
		if(last_time_dep && ntrains && get_last_train(trains)->train.time_departure == last_time_dep) {
			log_info("Got the exact same results twice, finishing up");
			break;
		}
		if(ntrains) {
			last_time_dep = get_last_train(trains)->train.time_departure;
		} else {
			log_info("No trains found, this is the end");
			break;
		}

		n = train_store(db_hdl, trains);	
		if(n!=ntrains) {
			log_info("only stored %lu out of %lu trains, aborting", n, ntrains);
			goto error;
		}
		debug("Stored all %lu trains", n);
		total+=n;

#ifdef NDEBUG
		localtime_r(&get_last_train(trains)->train.time_departure, &tm_dep);
		strftime(str_time_dep, 20, "%e-%b-%Y %R", &tm_dep);
		printf("Processed %6lu trains - Last one departed at %s\r",
			total, str_time_dep); 		
		fflush(stdout);
#else
		print_trains(db_hdl, trains, 0);
#endif
		consecutive_success++; 
		free(link);
		link = new_link;
	}
	free(link);
	free(new_link);

error:
	if(tdoc) {
		tidySaveFile(tdoc, "dumpfile-exit.html");
		tidyRelease(tdoc);
	}
	curl_tidy_cleanup(curl_hdl);
	database_cleanup(db_hdl);

	localtime_r(&get_last_train(trains)->train.time_departure, &tm_dep);
	strftime(str_time_dep, 20, "%e-%b-%Y %R", &tm_dep);
	log_info("Exiting after storing %lu trains (last one arriving %s)", total, str_time_dep);
	return 0;

usage:
	printf(
		"Usage : %s -d <dbfile> -f <stn_dep> -t <stn_arr>\n"
		"\n"
		"\t<dbfile>\tThe sqlite3 database filename\n"
		"\t<stn_dep>\tThe departure station\n"
		"\t<stn_arr>\tThe arrival station\n"
		"\n",
	argv[0]);
	return 0;
}
示例#3
0
int main( int argc, char** argv )
{
    ctmbstr prog = argv[0];
    ctmbstr cfgfil = NULL, errfil = NULL, htmlfil = NULL;
    TidyDoc tdoc = tidyCreate();
    int status = 0;

    uint contentErrors = 0;
    uint contentWarnings = 0;
    uint accessWarnings = 0;

    errout = stderr;  /* initialize to stderr */
    status = 0;
    
#ifdef TIDY_CONFIG_FILE
    if ( tidyFileExists( tdoc, TIDY_CONFIG_FILE) )
    {
        status = tidyLoadConfig( tdoc, TIDY_CONFIG_FILE );
        if ( status != 0 )
            fprintf(errout, "Loading config file \"%s\" failed, err = %d\n", TIDY_CONFIG_FILE, status);
    }
#endif /* TIDY_CONFIG_FILE */

    /* look for env var "HTML_TIDY" */
    /* then for ~/.tidyrc (on platforms defining $HOME) */

    if ( (cfgfil = getenv("HTML_TIDY")) != NULL )
    {
        status = tidyLoadConfig( tdoc, cfgfil );
        if ( status != 0 )
            fprintf(errout, "Loading config file \"%s\" failed, err = %d\n", cfgfil, status);
    }
#ifdef TIDY_USER_CONFIG_FILE
    else if ( tidyFileExists( tdoc, TIDY_USER_CONFIG_FILE) )
    {
        status = tidyLoadConfig( tdoc, TIDY_USER_CONFIG_FILE );
        if ( status != 0 )
            fprintf(errout, "Loading config file \"%s\" failed, err = %d\n", TIDY_USER_CONFIG_FILE, status);
    }
#endif /* TIDY_USER_CONFIG_FILE */

    /* read command line */
    while ( argc > 0 )
    {
        if (argc > 1 && argv[1][0] == '-')
        {
            /* support -foo and --foo */
            ctmbstr arg = argv[1] + 1;

            if ( strcasecmp(arg, "xml") == 0)
                tidyOptSetBool( tdoc, TidyXmlTags, yes );

            else if ( strcasecmp(arg,   "asxml") == 0 ||
                      strcasecmp(arg, "asxhtml") == 0 )
            {
                tidyOptSetBool( tdoc, TidyXhtmlOut, yes );
            }
            else if ( strcasecmp(arg,   "ashtml") == 0 )
                tidyOptSetBool( tdoc, TidyHtmlOut, yes );

            else if ( strcasecmp(arg, "indent") == 0 )
            {
                tidyOptSetInt( tdoc, TidyIndentContent, TidyAutoState );
                if ( tidyOptGetInt(tdoc, TidyIndentSpaces) == 0 )
                    tidyOptResetToDefault( tdoc, TidyIndentSpaces );
            }
            else if ( strcasecmp(arg, "omit") == 0 )
                tidyOptSetBool( tdoc, TidyHideEndTags, yes );

            else if ( strcasecmp(arg, "upper") == 0 )
                tidyOptSetBool( tdoc, TidyUpperCaseTags, yes );

            else if ( strcasecmp(arg, "clean") == 0 )
                tidyOptSetBool( tdoc, TidyMakeClean, yes );

            else if ( strcasecmp(arg, "bare") == 0 )
                tidyOptSetBool( tdoc, TidyMakeBare, yes );

            else if ( strcasecmp(arg, "raw") == 0      ||
                      strcasecmp(arg, "ascii") == 0    ||
                      strcasecmp(arg, "latin0") == 0   ||
                      strcasecmp(arg, "latin1") == 0   ||
                      strcasecmp(arg, "utf8") == 0     ||
#ifndef NO_NATIVE_ISO2022_SUPPORT
                      strcasecmp(arg, "iso2022") == 0  ||
#endif
#if SUPPORT_UTF16_ENCODINGS
                      strcasecmp(arg, "utf16le") == 0  ||
                      strcasecmp(arg, "utf16be") == 0  ||
                      strcasecmp(arg, "utf16") == 0    ||
#endif
#if SUPPORT_ASIAN_ENCODINGS
                      strcasecmp(arg, "shiftjis") == 0 ||
                      strcasecmp(arg, "big5") == 0     ||
#endif
                      strcasecmp(arg, "mac") == 0      ||
                      strcasecmp(arg, "win1252") == 0  ||
                      strcasecmp(arg, "ibm858") == 0 )
            {
                tidySetCharEncoding( tdoc, arg );
            }
            else if ( strcasecmp(arg, "numeric") == 0 )
                tidyOptSetBool( tdoc, TidyNumEntities, yes );

            else if ( strcasecmp(arg, "modify") == 0 ||
                      strcasecmp(arg, "change") == 0 ||  /* obsolete */
                      strcasecmp(arg, "update") == 0 )   /* obsolete */
            {
                tidyOptSetBool( tdoc, TidyWriteBack, yes );
            }
            else if ( strcasecmp(arg, "errors") == 0 )
                tidyOptSetBool( tdoc, TidyShowMarkup, no );

            else if ( strcasecmp(arg, "quiet") == 0 )
                tidyOptSetBool( tdoc, TidyQuiet, yes );

            else if ( strcasecmp(arg, "help") == 0 ||
                      strcasecmp(arg,    "h") == 0 || *arg == '?' )
            {
                help( prog );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "xml-help") == 0)
            {
                xml_help( );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "help-config") == 0 )
            {
                optionhelp( tdoc );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "xml-config") == 0 )
            {
                XMLoptionhelp( tdoc );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "show-config") == 0 )
            {
                optionvalues( tdoc );
                tidyRelease( tdoc );
                return 0; /* success */
            }
            else if ( strcasecmp(arg, "config") == 0 )
            {
                if ( argc >= 3 )
                {
                    ctmbstr post;

                    tidyLoadConfig( tdoc, argv[2] );

                    /* Set new error output stream if setting changed */
                    post = tidyOptGetValue( tdoc, TidyErrFile );
                    if ( post && (!errfil || !samefile(errfil, post)) )
                    {
                        errfil = post;
                        errout = tidySetErrorFile( tdoc, post );
                    }

                    --argc;
                    ++argv;
                }
            }

#if SUPPORT_ASIAN_ENCODINGS
            else if ( strcasecmp(arg, "language") == 0 ||
                      strcasecmp(arg,     "lang") == 0 )
            {
                if ( argc >= 3 )
                {
                    tidyOptSetValue( tdoc, TidyLanguage, argv[2] );
                    --argc;
                    ++argv;
                }
            }
#endif

            else if ( strcasecmp(arg, "output") == 0 ||
                      strcasecmp(arg, "-output-file") == 0 ||
                      strcasecmp(arg, "o") == 0 )
            {
                if ( argc >= 3 )
                {
                    tidyOptSetValue( tdoc, TidyOutFile, argv[2] );
                    --argc;
                    ++argv;
                }
            }
            else if ( strcasecmp(arg,  "file") == 0 ||
                      strcasecmp(arg, "-file") == 0 ||
                      strcasecmp(arg,     "f") == 0 )
            {
                if ( argc >= 3 )
                {
                    errfil = argv[2];
                    errout = tidySetErrorFile( tdoc, errfil );
                    --argc;
                    ++argv;
                }
            }
            else if ( strcasecmp(arg,  "wrap") == 0 ||
                      strcasecmp(arg, "-wrap") == 0 ||
                      strcasecmp(arg,     "w") == 0 )
            {
                if ( argc >= 3 )
                {
                    uint wraplen = 0;
                    int nfields = sscanf( argv[2], "%u", &wraplen );
                    tidyOptSetInt( tdoc, TidyWrapLen, wraplen );
                    if (nfields > 0)
                    {
                        --argc;
                        ++argv;
                    }
                }
            }
            else if ( strcasecmp(arg,  "version") == 0 ||
                      strcasecmp(arg, "-version") == 0 ||
                      strcasecmp(arg,        "v") == 0 )
            {
                version();
                tidyRelease( tdoc );
                return 0;  /* success */

            }
            else if ( strncmp(argv[1], "--", 2 ) == 0)
            {
                if ( tidyOptParseValue(tdoc, argv[1]+2, argv[2]) )
                {
                    /* Set new error output stream if setting changed */
                    ctmbstr post = tidyOptGetValue( tdoc, TidyErrFile );
                    if ( post && (!errfil || !samefile(errfil, post)) )
                    {
                        errfil = post;
                        errout = tidySetErrorFile( tdoc, post );
                    }

                    ++argv;
                    --argc;
                }
            }

#if SUPPORT_ACCESSIBILITY_CHECKS
            else if ( strcasecmp(arg, "access") == 0 )
            {
                if ( argc >= 3 )
                {
                    uint acclvl = 0;
                    int nfields = sscanf( argv[2], "%u", &acclvl );
                    tidyOptSetInt( tdoc, TidyAccessibilityCheckLevel, acclvl );
                    if (nfields > 0)
                    {
                        --argc;
                        ++argv;
                    }
                }
            }
#endif

            else
            {
                uint c;
                ctmbstr s = argv[1];

                while ( (c = *++s) != '\0' )
                {
                    switch ( c )
                    {
                    case 'i':
                        tidyOptSetInt( tdoc, TidyIndentContent, TidyAutoState );
                        if ( tidyOptGetInt(tdoc, TidyIndentSpaces) == 0 )
                            tidyOptResetToDefault( tdoc, TidyIndentSpaces );
                        break;

                    /* Usurp -o for output file.  Anyone hiding end tags?
                    case 'o':
                        tidyOptSetBool( tdoc, TidyHideEndTags, yes );
                        break;
                    */

                    case 'u':
                        tidyOptSetBool( tdoc, TidyUpperCaseTags, yes );
                        break;

                    case 'c':
                        tidyOptSetBool( tdoc, TidyMakeClean, yes );
                        break;

                    case 'b':
                        tidyOptSetBool( tdoc, TidyMakeBare, yes );
                        break;

                    case 'n':
                        tidyOptSetBool( tdoc, TidyNumEntities, yes );
                        break;

                    case 'm':
                        tidyOptSetBool( tdoc, TidyWriteBack, yes );
                        break;

                    case 'e':
                        tidyOptSetBool( tdoc, TidyShowMarkup, no );
                        break;

                    case 'q':
                        tidyOptSetBool( tdoc, TidyQuiet, yes );
                        break;

                    default:
                        unknownOption( c );
                        break;
                    }
                }
            }

            --argc;
            ++argv;
            continue;
        }

        if ( argc > 1 )
        {
            htmlfil = argv[1];
            if ( tidyOptGetBool(tdoc, TidyEmacs) )
                tidyOptSetValue( tdoc, TidyEmacsFile, htmlfil );
            status = tidyParseFile( tdoc, htmlfil );
        }
        else
        {
            htmlfil = "stdin";
            status = tidyParseStdin( tdoc );
        }

        if ( status >= 0 )
            status = tidyCleanAndRepair( tdoc );

        if ( status >= 0 )
            status = tidyRunDiagnostics( tdoc );

        if ( status > 1 ) /* If errors, do we want to force output? */
            status = ( tidyOptGetBool(tdoc, TidyForceOutput) ? status : -1 );

        if ( status >= 0 && tidyOptGetBool(tdoc, TidyShowMarkup) )
        {
            if ( tidyOptGetBool(tdoc, TidyWriteBack) && argc > 1 )
                status = tidySaveFile( tdoc, htmlfil );
            else
            {
                ctmbstr outfil = tidyOptGetValue( tdoc, TidyOutFile );
                if ( outfil )
                    status = tidySaveFile( tdoc, outfil );
                else
                    status = tidySaveStdout( tdoc );
            }
        }

        contentErrors   += tidyErrorCount( tdoc );
        contentWarnings += tidyWarningCount( tdoc );
        accessWarnings  += tidyAccessWarningCount( tdoc );

        --argc;
        ++argv;

        if ( argc <= 1 )
            break;
    }

    if (!tidyOptGetBool(tdoc, TidyQuiet) &&
        errout == stderr && !contentErrors)
        fprintf(errout, "\n");

    if (contentErrors + contentWarnings > 0 && 
         !tidyOptGetBool(tdoc, TidyQuiet))
        tidyErrorSummary(tdoc);

    if (!tidyOptGetBool(tdoc, TidyQuiet))
        tidyGeneralInfo(tdoc);

    /* called to free hash tables etc. */
    tidyRelease( tdoc );

    /* return status can be used by scripts */
    if ( contentErrors > 0 )
        return 2;

    if ( contentWarnings > 0 )
        return 1;

    /* 0 signifies all is ok */
    return 0;
}