/*
 * Collect the documents selected by tMode and add each of them to the XML
 * output opened on XML_PATH.
 *
 * path  - location to scan. Unless OLD_ZRT is defined, a path whose extension
 *         (as reported by getext_) is exactly "zip" is listed with
 *         get_file_list_inzip, regardless of tMode.
 * tMode - zip_obj:  list OBJECT_DEVICE_NAME with get_file_list_inzip;
 *         mail_obj: list path with get_file_list and the file type filter;
 *         any other value: nothing is indexed.
 *
 * Returns 0 in every case.
 */
int docs_to_xml (char * path, Input_Obj_Type tMode )
{
    SingleList_t tFileList, *pFileList = &tFileList;
    SingleList_t tFileTypeFilter, *pFileTypeFilter = &tFileTypeFilter;
    int is_zip_path = 0;
    int xml_fd;
    int i;

    initList( pFileList );
    setFileTypeFilter ( pFileTypeFilter );

#ifndef OLD_ZRT
    {
        /*
         * do_extract_text treats the getext_ result as possibly NULL and
         * releases it with free(); do the same here so a path without an
         * extension is not handed to strcmp and the buffer is not leaked.
         * NOTE(review): confirm ownership against the getext_ definition.
         */
        char *ext = (char *) getext_( path );

        if ( ext != NULL && strcmp ( ext, "zip" ) == 0 )
        {
            is_zip_path = 1;
        }
        free ( ext );
    }
#endif

    if ( is_zip_path )
    {
        get_file_list_inzip ( path, pFileList );
    }
    else if ( tMode == zip_obj )
    {
        get_file_list_inzip ( OBJECT_DEVICE_NAME, pFileList );
    }
    else if ( tMode == mail_obj )
    {
        get_file_list ( path, pFileList, pFileTypeFilter );
    }
    else
    {
        /* Unsupported mode: release both lists before leaving. */
        freeList( pFileList );
        freeList( pFileTypeFilter );
        return 0;
    }

    /* NOTE(review): the result of open_xml_ is not validated here; its
     * failure convention is not visible from this function. */
    xml_fd = open_xml_( XML_PATH, tMode ); /* FIXME const */
    for ( i = 0; i < pFileList->count; i++ )
    {
        add_doc_to_xml ( xml_fd, pFileList->list[i], tMode );
    }
    close_xml_( xml_fd );

    freeList( pFileList );
    freeList( pFileTypeFilter );

    return 0;
}
/*
 * Dispatch text extraction for input_file based on its extension
 * (case-insensitive, as reported by getext_).
 *
 *   doc, rtf          - run catdoc_main with argv {"catdoc", input_file};
 *                       output_file is not passed to it.
 *   txt, sh, html, c  - no conversion; the last four characters of
 *                       output_file are cut off in place (presumably a
 *                       suffix added by the caller - TODO confirm).
 *   docx, odt         - docx_to_text( input_file, output_file ).
 *   pdf               - pdf_to_text( input_file, output_file ).
 *
 * Returns 0 when the extension was handled, 1 when getext_ yields NULL,
 * when the extension is not supported, or when output_file is NULL or too
 * short to have four characters removed.
 */
int do_extract_text (char *input_file, char *output_file)
{
    int rc = 0;
    char *ext = (char *) getext_( input_file );

    if ( ext == NULL )
    {
        return 1;
    }

    if ( strcasecmp( ext, "doc" ) == 0 || strcasecmp( ext, "rtf" ) == 0 )
    {
        char *argv_catdoc [] = { "catdoc", input_file };
        int argc_catdoc = PCHARSIZE( argv_catdoc );

        /* The result of catdoc_main is ignored, as in the original code. */
        catdoc_main( argc_catdoc, argv_catdoc );
    }
    else if ( strcasecmp( ext, "txt" ) == 0
           || strcasecmp( ext, "sh" ) == 0
           || strcasecmp( ext, "html" ) == 0
           || strcasecmp( ext, "c" ) == 0 )
    {
        /*
         * strlen() returns an unsigned size_t, so "length - 4" wraps around
         * for names shorter than four characters and would write far outside
         * the buffer. Refuse such names instead.
         */
        size_t out_len = ( output_file != NULL ) ? strlen( output_file ) : 0;

        if ( out_len < 4 )
        {
            rc = 1;
        }
        else
        {
            output_file[ out_len - 4 ] = '\0';
        }
    }
    else if ( strcasecmp( ext, "docx" ) == 0 || strcasecmp( ext, "odt" ) == 0 )
    {
        /* odt goes through the same extractor as docx. */
        docx_to_text( input_file, output_file );
    }
    else if ( strcasecmp( ext, "pdf" ) == 0 )
    {
        pdf_to_text( input_file, output_file );
    }
    else
    {
        rc = 1;
    }

    /* Single exit: the extension string is released on every path. */
    free( ext );
    return rc;
}
int docs_to_xml (char * path, int doc_type) { int i = 0; SingleList_t tFileList, *pFileList=&tFileList, tFileTypeFilter, *pFileTypeFilter = &tFileTypeFilter; int xml_fd; char *ext = NULL; initList( pFileList ); setFileTypeFilter ( pFileTypeFilter ); #ifndef OLD_ZRT if ( strcmp ( getext_( path ), "zip") == 0 ) { get_file_list_inzip ( path , pFileList ); } else #endif print_dir_tree (path); get_file_list ( path, pFileList, pFileTypeFilter ); printf ( "list of indexed docs\n" ); for ( i = 0; i < pFileList->count; i++) printf ( "%s\n", pFileList->list[i] ); /////////////////////// xml_fd = open_xml_( XML_PATH ); //FIXME const for ( i = 0; i < pFileList->count; i++) { add_doc_to_xml ( xml_fd, pFileList->list[i], doc_type ); } close_xml_( xml_fd ); /////////////////////// freeList( pFileList ); freeList( pFileTypeFilter ); return 0; }