int main(int argc, char *argv[]) { char *encoding = NULL; bool doHash = false; if (argc < 2){ fprintf(stderr, "Usage: %s [ -e encoding ] filename [ [ -e encoding] filename2 ] ...\n", argv[0]); exit(1); } ucInit(); if ( ! hashinit() ) { log("db: Failed to init hashtable." ); return 1; } // . hashinit() calls srand() w/ a fixed number // . let's mix it up again srand ( time(NULL) ); int i; int flag = flNone; void (*parser)(char*,int,bool,char*) = NULL; char *parser_str = ""; for (i=1;i<argc;i++){ // Read cmdline args if (argv[i][0] == '-'){ if (gbstrlen(argv[i])<2){ fprintf(stderr, "Unknown argument: %s\n", argv[i]); exit(1); } switch(argv[i][1]){ case 'e': flag = flEncoding; break; case 'p': flag = flParser; break; case 'h': flag = flHash; break; case 's': flag = flFilterSpaces; break; default: fprintf(stderr, "Unknown flag: %s\n", argv[i]); exit(1); } continue; //next arg } // Switch default encoding if (flag == flEncoding){ encoding = argv[i]; flag=flNone; fprintf(stderr, "Using encoding: %s\n", encoding); continue; } // switch parser if (flag == flParser){ if (!strncmp(argv[i], "icu", 3)){ parser = parse_doc_icu; parser_str = "ICU BreakIterator"; } else { parser = parse_doc_8859_1; parser_str = "iso-8859-1"; } flag=flNone; fprintf(stderr, "Using parser: %s\n", parser_str); continue; } if (flag == flHash){ if ((!strncmp(argv[i], "0", 1)) || (!strncmp(argv[i], "f", 1))){ doHash = false; } else{ doHash = true; } flag = flNone; continue; } if (flag == flFilterSpaces){ if ((!strncmp(argv[i], "0", 1)) || (!strncmp(argv[i], "f", 1))){ doFilterSpaces = false; } else{ doFilterSpaces = true; } flag = flNone; continue; } char * filename = argv[i]; fprintf(stderr, "Reading \"%s\"\n", filename); FILE *fp = fopen(filename,"r"); if (!fp){ fprintf(stderr, "Error: could not open file \"%s\"\n", filename); continue; } // Get File size size_t file_size; fseek(fp, 0L, SEEK_END); file_size = (size_t)ftell(fp); fseek(fp, 0L, SEEK_SET); char *file_buf = (char*)malloc(file_size+1); size_t nread = fread(file_buf, (size_t)1,file_size, fp); fclose(fp); if (nread != file_size){ fprintf(stderr, "Warning: wanted %d chars, but read %d\n", file_size, nread); } file_buf[nread] = '\0'; //struct timeval tv1, tv2; //struct timezone tz1, tz2; long usec_elapsed; // int testnum; char ucBuf[128*1024]; long ucLen = ucToUnicode((UChar*)ucBuf,128*1024, file_buf,nread+1,"utf-8", 10); ucLen <<= 1; usec_elapsed = time_parser(parser, file_buf,nread,doHash, encoding, NUM_TEST_RUNS); fprintf(stderr,"Document parsed (%s, hash=%s, filterSpaces=%s): %ld usec\n", parser_str, doHash?"true":"false", doFilterSpaces?"true":"false", usec_elapsed); } fprintf(stderr, "Done\n"); return 0; }
int main (int argc, char **argv) { char * filename = argv[1]; fprintf(stderr, "Reading \"%s\"\n", filename); FILE *fp = fopen(filename,"r"); if (!fp){ fprintf(stderr, "Error: could not open file \"%s\"\n", filename); exit(1); } //get charset char *charset = argv[2]; // Get File size size_t file_size; fseek(fp, 0L, SEEK_END); file_size = (size_t)ftell(fp); fseek(fp, 0L, SEEK_SET); char *file_buf = (char*)malloc(file_size+1); size_t nread = fread(file_buf, (size_t)1,file_size, fp); fclose(fp); if (nread != file_size){ fprintf(stderr, "Warning: wanted %d chars, but read %d\n", file_size, nread); } file_buf[nread] = '\0'; int32_t ucBufSize = (int32_t)(nread*2.5); UChar *ucBuf = (UChar*)malloc(ucBufSize); int32_t ucLen = ucToUnicode(ucBuf, ucBufSize, file_buf, nread, "utf-8", NULL); struct timeval tv1, tv2; struct timezone tz1, tz2; int32_t times[test_count]; int64_t total=0; int32_t max_time=-1L; int32_t min_time=999999999L; int32_t avg_time; //int32_t u8size = nread*2; //char *u8buf = (char*)malloc(u8size); int32_t newsize = 0; for (int i=0;i<test_count;i++ ){ gettimeofday(&tv1, &tz1); newsize = ucToUnicode(ucBuf, ucBufSize, file_buf, nread, charset, NULL) << 1; gettimeofday(&tv2, &tz2); times[i] = elapsed_usec(&tv1, &tv2); total += times[i]; if (times[i] < min_time) min_time = times[i]; if (times[i] > max_time) max_time = times[i]; } avg_time = total/test_count; fprintf(stderr,"ICU size: %"INT32", count: %"INT32", avg: %"INT32", min: %"INT32", max: %"INT32"\n", newsize, test_count, avg_time, min_time, max_time); int outfd = open("icu.out", O_CREAT|O_RDWR|O_TRUNC, 00666); if (outfd < 0) {printf("Error creating output file: %s\n", strerror(errno)); exit(1);} write(outfd, ucBuf, newsize); close(outfd); #if 0 total = 0; min_time = 999999999L; max_time = -1L; for (int i=0;i<test_count;i++ ){ gettimeofday(&tv1, &tz1); //newsize = utf16ToUtf8_iconv(u8buf, u8size, ucBuf, ucLen); newsize = ucToUnicode_iconv(ucBuf, ucBufSize, file_buf, nread, charset, NULL) << 1; gettimeofday(&tv2, &tz2); times[i] = elapsed_usec(&tv1, &tv2); total += times[i]; if (times[i] < min_time) min_time = times[i]; if (times[i] > max_time) max_time = times[i]; } avg_time = total/test_count; fprintf(stderr,"iconv size: %"INT32", count: %"INT32", avg: %"INT32", min: %"INT32", max: %"INT32"\n", newsize, test_count, avg_time, min_time, max_time); outfd = open("iconv.out", O_CREAT|O_RDWR|O_TRUNC, 00666); if (outfd < 0) {printf("Error creating output file: %s\n", strerror(errno)); exit(1);} write(outfd, ucBuf, newsize); close(outfd); #endif #if 0 total = 0; min_time = 999999999L; max_time = -1L; for (int i=0;i<test_count;i++ ){ gettimeofday(&tv1, &tz1); newsize = utf16ToUtf8_intern(u8buf, u8size, ucBuf, ucLen); gettimeofday(&tv2, &tz2); times[i] = elapsed_usec(&tv1, &tv2); total += times[i]; if (times[i] < min_time) min_time = times[i]; if (times[i] > max_time) max_time = times[i]; } avg_time = total/test_count; fprintf(stderr,"my size: %"INT32", count: %"INT32", avg: %"INT32", min: %"INT32", max: %"INT32"\n", newsize, test_count, avg_time, min_time, max_time); outfd = open("my.out", O_CREAT|O_RDWR|O_TRUNC, 00666); if (outfd < 0) {printf("Error creating output file: %s\n", strerror(errno)); exit(1);} write(outfd, u8buf, newsize); close(outfd); #endif //printf("%s\n", u8buf); }