void parse_doc_icu(char *s, int len, bool doHash, char *charset){ Xml xml; xml.set( s, len, TITLEREC_CURRENT_VERSION, 0, CT_HTML ); // Extract text from (x)html char *text_buf = (char*)malloc(64*1024); int32_t textLen = xml.getText( text_buf, 64 * 1024, 0, 99999999, doFilterSpaces ); Words w; w.set(text_buf, textLen, doHash); free(text_buf); }
void parse_doc_8859_1(char *s, int len, bool doHash,char *charset) { Xml xml; xml.set( s, len, TITLEREC_CURRENT_VERSION, 0, CT_HTML ); // Extract text from (x)html char *text_buf = (char*)malloc(len+1); xml.getText( text_buf, len, 0, 99999999, doFilterSpaces ); Words words; // just tokenize words words.set(text_buf, len, doHash); free(text_buf); }
void parse_doc_icu(char *s, int len, bool doHash, char *charset){ Xml xml; xml.set(csUTF8,s,len,false, 0,false, TITLEREC_CURRENT_VERSION); //fprintf(stderr,"\nparse_doc_icu\n"); // Extract text from (x)html char *text_buf = (char*)malloc(64*1024); long textLen = xml.getText(text_buf, 64*1024, 0, 99999999, false, true, false, doFilterSpaces, false); Words w; w.set(true,false, text_buf, textLen, TITLEREC_CURRENT_VERSION,doHash); free(text_buf); }
void parse_doc_8859_1(char *s, int len, bool doHash,char *charset) { Xml xml; xml.set(csASCII,s,len,false, 0, false, TITLEREC_CURRENT_VERSION); //fprintf(stderr,"\nparse_doc_8859_1\n"); // Extract text from (x)html char *text_buf = (char*)malloc(len+1); xml.getText(text_buf, len, 0, 99999999, false, true, false, doFilterSpaces, false); Words words; // just tokenize words words.set(false, text_buf, TITEREC_CURRENT_VERSION, doHash); free(text_buf); }