/*
 * Create the "/index" directory and run the embedded indexer entry point
 * with the arguments: indexer --config SPHINX_CONFIG_FILE SPHINX_INDEX_NAME.
 *
 * Returns -1 (after printing a message to stderr) when mkdir() fails for any
 * reason, otherwise whatever indexer_main() returns.
 */
int do_index_xml ()
{
    /* The indexer is only started once its output directory exists. */
    if ( mkdir( "/index", 0777 ) != 0 )
    {
        fprintf ( stderr, "Index directory not created\n" );
        return -1;
    }

    char *indexer_args[] = {
        "indexer",
        "--config",
        SPHINX_CONFIG_FILE,
        SPHINX_INDEX_NAME
    };
    int indexer_arg_count = PCHARSIZE(indexer_args);

    return indexer_main( indexer_arg_count, indexer_args );
}
/*
 * Pick a text extractor for input_file based on its extension (compared
 * case-insensitively).
 *
 *   doc, rtf          -> catdoc_main() is run on input_file
 *   txt, sh, html, c  -> no conversion: the last four characters of
 *                        output_file are cut off in place (callers in this
 *                        file pass "<input>.tmp", so the result is the input
 *                        path itself)
 *   docx, odt         -> docx_to_text( input_file, output_file )
 *   pdf               -> pdf_to_text( input_file, output_file )
 *
 * Note that output_file may be MODIFIED by this call.
 *
 * Returns 0 when an extractor was selected, 1 when the file has no extension,
 * the extension is not supported, or output_file is too short to carry the
 * four-character suffix that has to be stripped.
 */
int do_extract_text (char *input_file, char *output_file)
{
    int status = 0;
    /* The string returned by getext_() is released below, as before. */
    char *ext = (char *) getext_( input_file );

    if ( ext == NULL )
        return 1;

    if ( (strcasecmp( ext, "doc" ) == 0) || (strcasecmp( ext, "rtf" ) == 0) )
    {
        /* catdoc replaced the older antiword-based doc_to_text() extractor. */
        char *argv_catdoc [] = {"catdoc", input_file};
        int argc_catdoc = PCHARSIZE( argv_catdoc );
        catdoc_main( (int) argc_catdoc, argv_catdoc );
    }
    else if ( (strcasecmp( ext, "txt" ) == 0) || (strcasecmp( ext, "sh" ) == 0) ||
              (strcasecmp( ext, "html" ) == 0) || (strcasecmp( ext, "c" ) == 0) )
    {
        size_t output_len = strlen( output_file );

        /* Guard the suffix strip: with fewer than four characters the index
         * would wrap around and write far outside the buffer. */
        if ( output_len < 4 )
            status = 1;
        else
            output_file[output_len - 4] = '\0';
    }
    else if ( (strcasecmp( ext, "docx" ) == 0) || (strcasecmp( ext, "odt" ) == 0) )
    {
        /* ODT files go through the same extractor as DOCX. */
        docx_to_text( input_file, output_file );
    }
    else if ( strcasecmp( ext, "pdf" ) == 0 )
    {
        pdf_to_text( input_file, output_file );
    }
    else
    {
        status = 1;
    }

    free( ext );
    return status;
}
void newbufferedpack_ (char *devname, char *dirname) { #define READDIR_FIX #undef READDIR_FIX int fdpackfile; fdpackfile = open (devname, O_WRONLY | O_CREAT, S_IROTH | S_IWOTH | S_IRUSR | S_IWUSR); fdpackfile = open (devname, O_WRONLY | O_CREAT , S_IROTH | S_IWOTH | S_IRUSR | S_IWUSR); if ( fdpackfile <= 0 ) { printf ("*** ZVM Error open packfile (write)%s\n", devname); return; } char *indexpath;//deirectory with index files and zspfinx.conf indexpath = dirname; DIR *dir; struct dirent *entry; dir = opendir(indexpath); char *newpath; #ifdef READDIR_FIX char *save_file_list [] = { "mainindex.sps", "mainindex.spm", "mainindex.spa", "mainindex.spk", "mainindex.spd", "mainindex.spp", "mainindex.spe", "mainindex.spi", "mainindex.sph", "zsphinx.conf" }; int save_file_list_count = PCHARSIZE(save_file_list); int i = 0; #endif if (!dir) printf ("*** ZVM Error open DIR %s\n", indexpath); int blocksize = 1024 * 64; // 10 Mb char *buff = NULL; buff = (char *) malloc (blocksize); long deltabytes = 0; long mainbytes = 0; int filecount = 0; #ifndef READDIR_FIX while((entry = readdir(dir))) { if(entry->d_type != DT_DIR) { #else for ( i = 0; i < save_file_list_count; i++) { { #endif size_t size; size_t bread = 0; size_t bwrite; size_t bytecount; bytecount = 0; int fd; #ifndef READDIR_FIX newpath = (char *) malloc (strlen (entry->d_name) + strlen(indexpath) + 2); sprintf(newpath, "%s/%s", indexpath, entry->d_name); #else newpath = (char *) malloc (strlen ( save_file_list[i] ) + strlen(indexpath) + 2); sprintf(newpath, "%s/%s", indexpath, save_file_list[i]); #endif fd = open (newpath, O_RDONLY); size = getfilesize_fd(fd, NULL, 0); printf ( "%s, %zu bytes\n", newpath, size ); char tempstr [strlen (newpath) + 12]; // write header (10 bytes size of filename + filename + 10 bytes size of filedata) sprintf(tempstr, "%10zu%s%10zu", strlen (newpath), newpath, size); bwrite = write (fdpackfile, tempstr, strlen (tempstr)); // write header (10 bytes size of filename + filename) //read and 
write file data if (size > 0) { while ((bread = read(fd, buff, blocksize)) > 0) { bytecount += bread; bwrite = write(fdpackfile, buff, bread); } } else bytecount = 0; close (fd); filecount++; } } free (buff); close (fdpackfile); } int prepare_temp_dir (char *dir_name) { if (mkdir ( dir_name, 0777 ) != 0 ) return -1; return 0; } int check_dir_exist (char * dir_path ) { struct stat st; int err = stat(dir_path, &st); if(-1 == err) { if(ENOENT == errno) { return 0; } else { perror("stat"); exit(1); } } else { if(S_ISDIR(st.st_mode)) { return 1; } else { return 2; } } } char * get_file_name_without_ext ( char * file_name ) { char *base_file_name = NULL; char *file_name_without_ext = NULL; int basename_len = 0, i = 0, file_name_without_ext_len = 0; base_file_name = basename ( file_name ); basename_len = strlen ( base_file_name ); for ( i = basename_len; i > 0; i-- ) { if ( base_file_name[i] == '.' ) break; } file_name_without_ext_len = basename_len - (basename_len - i); file_name_without_ext = (char *) malloc( sizeof ( char ) * ( file_name_without_ext_len + 1 ) ); memcpy( file_name_without_ext, base_file_name, file_name_without_ext_len ); file_name_without_ext[file_name_without_ext_len] = '\0'; return file_name_without_ext; }
/*
 * Run the embedded hypermail entry point (main_mbox) with the arguments
 *     hypermail -m OBJECT_DEVICE_NAME -d TEMP_DIR
 * and hand back its return value unchanged.
 *
 * NOTE(review): "-m" / "-d" are presumably hypermail's mailbox and output
 * directory options - confirm against the bundled hypermail sources.
 */
int prepare_mbox ()
{
    char *hypermail_args [] = {
        "hypermail",
        "-m", OBJECT_DEVICE_NAME,
        "-d", TEMP_DIR
    };
    int hypermail_arg_count = PCHARSIZE(hypermail_args);
    int status = main_mbox( hypermail_arg_count, hypermail_args );

    return status;
}
/*
 * Extract the text of one document and append a record for it to the XML
 * stream behind xml_fd.
 *
 * tMode == zip_obj:
 *     fileName is first unpacked from OBJECT_DEVICE_NAME into TEMP_DIR, its
 *     text is extracted, the unpacked copy is removed and the record is
 *     written with write_doc_toxml().
 * tMode == mail_obj:
 *     fileName is used as it is. Files whose base name (without extension) is
 *     one of date/index/subject/author/attachment are skipped. For the others
 *     the message fields are read from the message HTML and the record is
 *     written with write_Email_message_to_xml(). When the name of the
 *     directory holding fileName contains '-', the file is treated as an
 *     attachment and the fields are read from "TEMP_DIR/<part after '-'>.html".
 *
 * Returns 0 on success or when the file is skipped, -1 on failure (unpack,
 * stat, allocation, missing message id, unknown tMode).
 */
int add_doc_to_xml ( int xml_fd, char *fileName, Input_Obj_Type tMode )
{
    char *do_not_index[] = { "date", "index", "subject", "author", "attachment" };
    int do_not_index_count = PCHARSIZE(do_not_index);
    struct stat st;
    int result = -1;
    int ret = 0;
    int i = 0;
    size_t size_text = 0;
    time_t sent_tm_t;
    time_t recv_tm_t;
    char file_len_str[32];
    char sent_secs[32];
    char recv_secs[32];
    char *file_name_in_fs = NULL;
    char *zip_path = NULL;              /* owned copy of the path in zip mode */
    char *tmpFile = NULL;
    char *text = NULL;
    char *base_name_without_ext = NULL;
    char *temp_filepath = NULL;
    char *temp_dir_name = NULL;
    char *temp_base_name = NULL;
    char *temp_pos = NULL;
    char *temp_file_name = NULL;
    char *temp_message_id = NULL;
    char *temp_full_file_name = NULL;
    char *use_file = NULL;
    char *from_field = NULL;
    char *subj_field = NULL;
    char *offset_attr = NULL;
    char *sent_ch = NULL;
    char *recv_ch = NULL;
    char *message_id_attr = NULL;

    if ( tMode == zip_obj )
    {
        if ( get_one_file_from_zip ( OBJECT_DEVICE_NAME, fileName ) != 0 )
            return -1;

        zip_path = malloc( CHARSIZE( strlen ( fileName ) + strlen ( TEMP_DIR ) + 2) );
        if ( zip_path == NULL )
            return -1;
        sprintf ( zip_path, "%s/%s", TEMP_DIR, fileName );
        file_name_in_fs = zip_path;
    }
    else if ( tMode == mail_obj )
    {
        file_name_in_fs = fileName;
    }
    else
    {
        /* No path can be derived for an unknown mode. */
        fprintf( stderr, "unsupported input object type\n" );
        return -1;
    }

    if ( ( ret = stat( file_name_in_fs, &st ) ) != 0 )
    {
        fprintf( stderr, "stat failure error %d\n", ret );
        goto cleanup;
    }

    /* st_size is an off_t, so it is printed through long long. */
    snprintf( file_len_str, sizeof file_len_str, "%lld", (long long) st.st_size );

    tmpFile = malloc( CHARSIZE( strlen ( file_name_in_fs ) + 10) );
    if ( tmpFile == NULL )
        goto cleanup;
    sprintf( tmpFile, "%s.tmp", file_name_in_fs );

    /* do_extract_text() may shorten tmpFile in place for plain-text inputs. */
    if ( do_extract_text( file_name_in_fs, tmpFile ) == 1 )
    {
        text = NULL;
        size_text = 0;
    }
    else
    {
        text = get_text_from_file( tmpFile, &size_text );
        if ( text != NULL )
            filtering_buff( text, size_text );
    }

    if ( tMode == zip_obj )
    {
        remove( file_name_in_fs );
        write_doc_toxml( xml_fd, num_CRC32( fileName ), text, size_text, fileName, file_len_str );
        result = 0;
        goto cleanup;
    }

    /* ---- mail_obj from here on ---- */

    base_name_without_ext = get_file_name_without_ext( fileName );
    if ( base_name_without_ext == NULL )
        goto cleanup;

    for ( i = 0; i < do_not_index_count; i++ )
    {
        if ( strcasecmp( base_name_without_ext, do_not_index[i] ) == 0 )
        {
            /* Listed files are deliberately left out of the index. */
            result = 0;
            goto cleanup;
        }
    }

    /* dirname() may modify its argument, so it gets a private copy. */
    temp_filepath = strdup( fileName );
    if ( temp_filepath == NULL )
        goto cleanup;
    temp_dir_name = dirname( temp_filepath );
    temp_base_name = basename( temp_dir_name );
    temp_pos = strstr( temp_base_name, "-" );

    if ( temp_pos != NULL ) // if attachment
    {
        temp_pos++;
        temp_file_name = malloc( CHARSIZE( strlen ( TEMP_DIR ) + strlen ( temp_base_name ) + 20) );
        if ( temp_file_name == NULL )
            goto cleanup;
        sprintf( temp_file_name, "%s/%s.html", TEMP_DIR, temp_pos );

        temp_message_id = get_message_ID_from_html( temp_file_name );
        if ( temp_message_id == NULL )
        {
            fprintf( stderr, "message id not found in %s\n", temp_file_name );
            goto cleanup;
        }

        temp_full_file_name = malloc( CHARSIZE( strlen (temp_message_id) + strlen ( fileName ) + 5 ) );
        if ( temp_full_file_name == NULL )
            goto cleanup;
        sprintf( temp_full_file_name, "%s/%s", temp_message_id, fileName );
        use_file = temp_file_name;
    }
    else // if message body
    {
        temp_full_file_name = get_message_ID_from_html( fileName );
        use_file = fileName;
    }

    from_field = get_element_from_html( use_file, "email" );
    subj_field = get_element_from_html( use_file, "subject" );
    offset_attr = get_element_from_html( use_file, "offset_email" );
    sent_ch = get_element_from_html( use_file, "isosent" );
    recv_ch = get_element_from_html( use_file, "isoreceived" );
    message_id_attr = get_element_from_html( use_file, "id" );

    sent_tm_t = iso_to_secs( sent_ch );
    recv_tm_t = iso_to_secs( recv_ch );

    /* The second counts are formatted into local buffers: the element
     * strings may be NULL and their capacity is not known here. */
    snprintf( sent_secs, sizeof sent_secs, "%ld", (long) sent_tm_t );
    snprintf( recv_secs, sizeof recv_secs, "%ld", (long) recv_tm_t );

    if ( subj_field != NULL )
        filtering_buff( subj_field, strlen ( subj_field ) );
    if ( from_field != NULL )
        filtering_buff( from_field, strlen( from_field ) );

    /* NOTE(review): strchr() yields NULL when fileName has no '/' - confirm
     * that write_Email_message_to_xml() accepts that. */
    write_Email_message_to_xml( xml_fd, num_CRC32( temp_full_file_name ), text, size_text,
                                from_field, subj_field, offset_attr, sent_secs, recv_secs,
                                strchr( fileName, '/' ), message_id_attr );
    result = 0;

cleanup:
    free( from_field );
    free( subj_field );
    free( offset_attr );
    free( sent_ch );
    free( recv_ch );
    free( message_id_attr );
    free( temp_file_name );
    free( temp_message_id );
    free( temp_full_file_name );
    free( temp_filepath );
    free( base_name_without_ext );
    free( tmpFile );
    free( text );
    free( zip_path );
    return result;
}
int add_doc_to_xml (int xml_fd, char *fileName, int doc_type) { struct stat st; struct tm *t; int ret = 0; char *xml_ = NULL; char *text = NULL; char *fileLenBuff = NULL; char *tmpFile = NULL; size_t size_text = 0; if ((ret = stat(fileName , &st))!=0) { fprintf(stderr, "stat failure error .%d", ret); return -1; } fileLenBuff = (char *) malloc ( CHARSIZE(50) ); sprintf ( fileLenBuff, "%zu", st.st_size ); tmpFile = (char *) malloc ( CHARSIZE( strlen ( fileName ) + 10) ); sprintf ( tmpFile, "%s.tmp" , fileName ); if (do_extract_text ( fileName, tmpFile ) == 1) { text = NULL; size_text = 0; } else { text = get_text_from_file( tmpFile, &size_text); filtering_buff ( text, size_text ); } if ( doc_type == 1 ) write_doc_toxml ( xml_fd, num_CRC32( fileName ), text, size_text, fileName, fileLenBuff ); else if ( doc_type ==2 ) { char *base_name_without_ext = NULL; char *temp_dir_name = NULL; char *temp_base_name = NULL; char *temp_file_name = NULL; char *temp_pos = NULL; char *temp_filepath = NULL; char *temp_message_id = NULL; char *temp_full_file_name = NULL; char *do_not_index[] = {"date", "index", "subject", "author", "attachment"}; int do_not_index_count = PCHARSIZE(do_not_index); int i = 0; int skip_indexing = 0; base_name_without_ext = get_file_name_without_ext( fileName ); for ( i = 0; i < do_not_index_count; i++ ) { if (strcasecmp( base_name_without_ext , do_not_index[i]) == 0) { skip_indexing = 1; break; } } if ( skip_indexing != 0 ) { free (fileLenBuff); free (tmpFile); free (text); return 0; } // BUG in glibc. 
we need to save fileName temp_filepath = strdup ( fileName ); temp_dir_name = dirname( temp_filepath ); temp_base_name = basename ( temp_dir_name ); temp_pos = NULL; temp_pos = strstr( temp_base_name, "-"); if ( temp_pos != NULL ) { temp_pos++; temp_file_name = ( char * ) malloc( CHARSIZE( strlen ( TEMP_DIR ) + strlen ( temp_base_name ) + 20) ); sprintf ( temp_file_name, "%s/%s.html", TEMP_DIR, temp_pos); temp_message_id = get_message_ID_from_html ( temp_file_name ); temp_full_file_name = (char *) malloc( CHARSIZE( strlen (temp_message_id) + strlen ( fileName ) + 5 ) ); sprintf ( temp_full_file_name, "%s/%s", temp_message_id, fileName); free ( temp_file_name ); } else { temp_full_file_name = get_message_ID_from_html ( fileName ); } write_doc_toxml ( xml_fd, num_CRC32( temp_full_file_name ), text, size_text, fileName, fileLenBuff ); free ( temp_full_file_name ); } free (fileLenBuff); free (tmpFile); free (text); return 0; }