/*
 * Create the "/index" directory and run the indexer on it.
 *
 * The indexer is started through indexer_main() with the argument vector
 * { "indexer", "--config", SPHINX_CONFIG_FILE, SPHINX_INDEX_NAME }.
 *
 * Returns -1 (after printing a message to stderr) when mkdir() reports any
 * failure, otherwise whatever indexer_main() returns.
 */
int do_index_xml ()
{
	char *indexer_args[] = {"indexer", "--config", SPHINX_CONFIG_FILE, SPHINX_INDEX_NAME};
	int indexer_arg_count = PCHARSIZE(indexer_args);

	/* the indexer is only started once its output directory has been created */
	if ( mkdir( "/index", 0777 ) == 0 )
		return indexer_main(indexer_arg_count, indexer_args);

	fprintf ( stderr, "Index directory not created\n" );
	return -1;
}
/*
 * Extract plain text from input_file, choosing the converter by the file
 * extension reported by getext_().
 *
 *   doc, rtf          -> catdoc_main() is run on input_file
 *   txt, sh, html, c  -> no conversion: the last four characters of
 *                        output_file are cut off (callers in this file pass
 *                        "<input>.tmp", so the name becomes the input path)
 *   docx, odt         -> docx_to_text()
 *   pdf               -> pdf_to_text()
 *
 * NOTE: output_file is modified in place in the plain-text case.
 * NOTE(review): "odt" is routed to docx_to_text(), same as the original
 * code — confirm that converter really handles ODT.
 *
 * Returns 0 when a handler was selected, 1 when the extension is missing or
 * unsupported (or output_file is too short to carry a 4-character suffix).
 * Return values of the converters themselves are not inspected.
 */
int do_extract_text (char *input_file, char *output_file)
{
	int rc = 0;
	char *ext = (char *) getext_( input_file );

	if (ext == NULL)
		return 1;

	if ( (strcasecmp( ext, "doc" ) == 0) || (strcasecmp( ext, "rtf" ) == 0) )
	{
		char *argv_catdoc [] = {"catdoc", input_file};
		int argc_catdoc = PCHARSIZE( argv_catdoc );
		catdoc_main( argc_catdoc, argv_catdoc );
	}
	else if ( (strcasecmp( ext, "txt" ) == 0) || (strcasecmp( ext, "sh" ) == 0)
			|| (strcasecmp( ext, "html" ) == 0) || (strcasecmp( ext, "c" ) == 0) )
	{
		size_t out_len = strlen ( output_file );

		/* guard the subtraction: a name shorter than 4 characters would
		 * otherwise wrap around and write far outside the buffer */
		if ( out_len >= 4 )
			output_file [ out_len - 4 ] = '\0';
		else
			rc = 1;
	}
	else if ( (strcasecmp( ext, "docx" ) == 0) || (strcasecmp( ext, "odt" ) == 0) )
	{
		docx_to_text( input_file, output_file );
	}
	else if ( strcasecmp( ext, "pdf" ) == 0 )
	{
		pdf_to_text( input_file, output_file );
	}
	else
	{
		/* unsupported extension */
		rc = 1;
	}

	/* single exit: ext is released on every path */
	free (ext);
	return rc;
}
/* Example no. 3 */
/* 0 */
/*
 * Build-time switch, left disabled exactly as before: when READDIR_FIX is
 * defined, a fixed list of index files is packed instead of the entries
 * reported by readdir().
 */
#undef READDIR_FIX

/*
 * Write exactly len bytes from data to fd, continuing after short writes.
 * Returns 0 on success, -1 when write() fails or makes no progress.
 */
static int pack_write_all (int fd, const char *data, size_t len)
{
	while (len > 0)
	{
		ssize_t written = write (fd, data, len);
		if (written <= 0)
			return -1;
		data += written;
		len -= (size_t) written;
	}
	return 0;
}

/*
 * Append the file "<dirpath>/<name>" to the pack file.
 *
 * Record layout: the path length as a right-aligned decimal field of width
 * 10, the path itself, the data size as a right-aligned decimal field of
 * width 10, then the raw file contents. The size comes from getfilesize_fd().
 *
 * buff is a caller-provided scratch buffer of blocksize bytes.
 *
 * Returns 0 on success, 1 when the source file could not be opened (nothing
 * was written for it), -1 on any other failure.
 */
static int pack_one_file (int fdpackfile, const char *dirpath, const char *name,
		char *buff, size_t blocksize)
{
	int rc = -1;
	int fd = -1;
	int header_len;
	char *newpath = NULL;
	char *header = NULL;
	size_t size;
	ssize_t bread;

	newpath = malloc (strlen (name) + strlen (dirpath) + 2);
	if (newpath == NULL)
	{
		printf ("*** ZVM Error no memory for path %s/%s\n", dirpath, name);
		goto out;
	}
	sprintf (newpath, "%s/%s", dirpath, name);

	fd = open (newpath, O_RDONLY);
	if (fd < 0)
	{
		printf ("*** ZVM Error open file (read) %s\n", newpath);
		rc = 1;
		goto out;
	}

	size = getfilesize_fd (fd, NULL, 0);
	printf ( "%s, %zu bytes\n", newpath, size );

	/* measure the header first: the two numeric fields are at least 10
	 * characters wide, so a fixed "path length + 12" buffer is too small */
	header_len = snprintf (NULL, 0, "%10zu%s%10zu", strlen (newpath), newpath, size);
	if (header_len < 0)
		goto out;
	header = malloc ((size_t) header_len + 1);
	if (header == NULL)
	{
		printf ("*** ZVM Error no memory for header of %s\n", newpath);
		goto out;
	}
	snprintf (header, (size_t) header_len + 1, "%10zu%s%10zu", strlen (newpath), newpath, size);

	if (pack_write_all (fdpackfile, header, (size_t) header_len) != 0)
	{
		printf ("*** ZVM Error write packfile header for %s\n", newpath);
		goto out;
	}

	/* copy the file data block by block */
	if (size > 0)
	{
		/* bread is signed so that a read() error (-1) ends the loop */
		while ((bread = read (fd, buff, blocksize)) > 0)
		{
			if (pack_write_all (fdpackfile, buff, (size_t) bread) != 0)
			{
				printf ("*** ZVM Error write packfile data for %s\n", newpath);
				goto out;
			}
		}
		if (bread < 0)
		{
			printf ("*** ZVM Error read file %s\n", newpath);
			goto out;
		}
	}
	rc = 0;

out:
	free (header);
	free (newpath);
	if (fd >= 0)
		close (fd);
	return rc;
}

/*
 * Pack the files found in directory dirname into the file devname.
 *
 * devname is opened for writing (created if missing) and every entry of
 * dirname that readdir() does not report as a directory is appended with
 * pack_one_file(). Files that cannot be opened are skipped; any other error
 * stops the packing. Errors are reported on stdout; nothing is returned.
 */
void newbufferedpack_ (char *devname, char *dirname)
{
	const size_t blocksize = 1024 * 64; /* 64 KiB copy buffer */
	char *indexpath = dirname;          /* directory with index files and zsphinx.conf */
	DIR *dir = NULL;
	char *buff = NULL;
	int fdpackfile;

	fdpackfile = open (devname, O_WRONLY | O_CREAT, S_IROTH | S_IWOTH | S_IRUSR | S_IWUSR);
	if ( fdpackfile < 0 )
	{
		printf ("*** ZVM Error open packfile (write)%s\n", devname);
		return;
	}

	dir = opendir (indexpath);
	if (!dir)
	{
		/* readdir() on a NULL stream is undefined, so stop here */
		printf ("*** ZVM Error open DIR %s\n", indexpath);
		goto out;
	}

	buff = malloc (blocksize);
	if (buff == NULL)
	{
		printf ("*** ZVM Error no memory for copy buffer\n");
		goto out;
	}

#ifndef READDIR_FIX
	struct dirent *entry;
	while ((entry = readdir (dir)) != NULL)
	{
		if (entry->d_type == DT_DIR)
			continue;
		if (pack_one_file (fdpackfile, indexpath, entry->d_name, buff, blocksize) < 0)
			break;
	}
#else
	char *save_file_list [] = { "mainindex.sps", "mainindex.spm", "mainindex.spa", "mainindex.spk",
			"mainindex.spd", "mainindex.spp", "mainindex.spe", "mainindex.spi",
			"mainindex.sph", "zsphinx.conf" };
	int save_file_list_count = PCHARSIZE(save_file_list);
	int i;
	for ( i = 0; i < save_file_list_count; i++ )
	{
		if (pack_one_file (fdpackfile, indexpath, save_file_list[i], buff, blocksize) < 0)
			break;
	}
#endif

out:
	free (buff);
	if (dir)
		closedir (dir);
	close (fdpackfile);
}

/*
 * Create the directory dir_name with mode 0777.
 * Returns 0 when mkdir() succeeds and -1 for any mkdir() failure.
 */
int prepare_temp_dir (char *dir_name)
{
	int mkdir_status = mkdir ( dir_name, 0777 );

	return ( mkdir_status == 0 ) ? 0 : -1;
}

/*
 * Report what, if anything, exists at dir_path.
 *
 * Returns 0 when stat() fails with ENOENT, 1 when the path is a directory,
 * and 2 when it exists but is not a directory.
 * Any other stat() failure is reported with perror("stat") and terminates
 * the process with exit status 1.
 */
int check_dir_exist (char * dir_path )
{
	struct stat info;

	if ( stat(dir_path, &info) != -1 )
		return S_ISDIR(info.st_mode) ? 1 : 2;

	if ( errno == ENOENT )
		return 0;

	/* unexpected stat() error: give up on the whole program */
	perror("stat");
	exit(1);
}

/*
 * Return the base name of file_name with its last extension removed,
 * e.g. "dir/file.txt" -> "file", "a.tar.gz" -> "a.tar".
 *
 * A name without any dot, or whose only dot is the leading character
 * (".bashrc"), has no extension and is returned whole; previously such
 * names produced an empty string.
 *
 * file_name is handed to basename(), which POSIX permits to modify its
 * argument, so it must point to writable storage.
 *
 * Returns a newly allocated string that the caller must free(), or NULL
 * when the allocation fails.
 */
char * get_file_name_without_ext ( char * file_name )
{
	char *base_file_name = basename ( file_name );
	char *last_dot = strrchr ( base_file_name, '.' );
	char *file_name_without_ext = NULL;
	size_t stem_len;

	if ( last_dot == NULL || last_dot == base_file_name )
		stem_len = strlen ( base_file_name );
	else
		stem_len = (size_t) ( last_dot - base_file_name );

	file_name_without_ext = malloc( stem_len + 1 );
	if ( file_name_without_ext == NULL )
		return NULL;

	memcpy( file_name_without_ext, base_file_name, stem_len );
	file_name_without_ext[stem_len] = '\0';
	return file_name_without_ext;
}
/*
 * Run main_mbox() with the argument vector
 * { "hypermail", "-m", OBJECT_DEVICE_NAME, "-d", TEMP_DIR }
 * and hand its result back to the caller unchanged.
 */
int prepare_mbox ()
{
	char *hypermail_args [] = { "hypermail", "-m", OBJECT_DEVICE_NAME, "-d", TEMP_DIR };
	int hypermail_arg_count = PCHARSIZE(hypermail_args);
	int status = main_mbox( hypermail_arg_count, hypermail_args );

	return status;
}
int add_doc_to_xml ( int xml_fd, char *fileName, Input_Obj_Type tMode )
{
	struct stat st;
	struct tm *t;
	int ret = 0;
	char *xml_ = NULL;
	char *text = NULL;
	char *fileLenBuff = NULL;
	char *tmpFile = NULL;
	char *file_name_in_fs = NULL;
	size_t size_text = 0;

	if ( tMode == zip_obj )
	{
		if ( get_one_file_from_zip ( OBJECT_DEVICE_NAME, fileName ) != 0 )
		{
			return -1;
		}
		file_name_in_fs = malloc( CHARSIZE( strlen ( fileName ) + strlen ( TEMP_DIR ) + 2) );
		sprintf ( file_name_in_fs, "%s/%s", TEMP_DIR, fileName );
	}
	else if ( tMode == mail_obj )
	{
		file_name_in_fs = fileName;
	}

	if ( ( ret = stat( file_name_in_fs, &st ) ) != 0 )
	{
		fprintf( stderr, "stat failure error %d\n", ret );
		return -1;
	}

	fileLenBuff = (char *) malloc( CHARSIZE(50) );
	sprintf( fileLenBuff, "%zu", st.st_size );

	tmpFile = (char *) malloc( CHARSIZE( strlen ( file_name_in_fs ) + 10) );
	sprintf( tmpFile, "%s.tmp", file_name_in_fs );


	if ( do_extract_text( file_name_in_fs, tmpFile ) == 1 )
	{
		text = NULL;
		size_text = 0;
	}
	else
	{
		text = get_text_from_file( tmpFile, &size_text );
		filtering_buff( text, size_text );
	}
	if ( tMode == zip_obj )
	{
		remove( file_name_in_fs );
		free (file_name_in_fs);
		write_doc_toxml( xml_fd, num_CRC32( fileName ), text, size_text, fileName, fileLenBuff );
	}
	else if ( tMode == mail_obj )
	{
		char *base_name_without_ext = NULL;
		char *temp_dir_name = NULL;
		char *temp_base_name = NULL;
		char *temp_file_name = NULL;
		char *temp_pos = NULL;
		char *temp_filepath = NULL;
		char *temp_message_id = NULL;
		char *temp_full_file_name = NULL;
		char *use_file = NULL;
		char *do_not_index[] = { "date", "index", "subject", "author", "attachment" };
		int do_not_index_count = PCHARSIZE(do_not_index);
		int i = 0;
		int skip_indexing = 0;
		base_name_without_ext = get_file_name_without_ext( fileName );
		for( i = 0; i < do_not_index_count; i++ )
		{
			if ( strcasecmp( base_name_without_ext, do_not_index[i] ) == 0 )
			{
				skip_indexing = 1;
				break;
			}
		}
		if ( skip_indexing != 0 )
		{
			free( fileLenBuff );
			free( tmpFile );
			free( text );
			return 0;
		}
		// BUG in glibc. we need to save fileName
		temp_filepath = strdup( fileName );
		temp_dir_name = dirname( temp_filepath );
		temp_base_name = basename( temp_dir_name );
		temp_pos = NULL;
		temp_pos = strstr( temp_base_name, "-" );

		char *from_field = NULL;
		char *subj_field = NULL;
		char *offset_attr = NULL;
		char *sent_ch = NULL;
		char *recv_ch = NULL;
		time_t sent_tm_t;
		time_t recv_tm_t;
		char *message_id_attr = NULL;

		if ( temp_pos != NULL ) // if attachment
		{
			temp_pos++;
			temp_file_name = (char *) malloc( CHARSIZE( strlen ( TEMP_DIR ) + strlen ( temp_base_name ) + 20) );
			sprintf( temp_file_name, "%s/%s.html", TEMP_DIR, temp_pos );
			temp_message_id = get_message_ID_from_html( temp_file_name );
			temp_full_file_name = (char *) malloc( CHARSIZE( strlen (temp_message_id) + strlen ( fileName ) + 5 ) );
			sprintf( temp_full_file_name, "%s/%s", temp_message_id, fileName );
			use_file = temp_file_name;
		}
		else // if message body
		{
			temp_full_file_name = get_message_ID_from_html( fileName );
			use_file = fileName;
		}

		from_field = get_element_from_html( use_file, "email" );
		subj_field = get_element_from_html( use_file, "subject" );
		offset_attr = get_element_from_html( use_file, "offset_email" );
		sent_ch = get_element_from_html( use_file, "isosent" );
		recv_ch = get_element_from_html( use_file, "isoreceived" );
		message_id_attr = get_element_from_html( use_file, "id" );

		sent_tm_t = iso_to_secs( sent_ch );
		recv_tm_t = iso_to_secs( recv_ch );

		sprintf( sent_ch, "%ld", sent_tm_t );
		sprintf( recv_ch, "%ld", recv_tm_t );

		if ( subj_field != NULL )
			filtering_buff( subj_field, strlen ( subj_field ) );
		if ( from_field != NULL )
			filtering_buff( from_field, strlen( from_field ) );


		write_Email_message_to_xml( xml_fd, num_CRC32( temp_full_file_name ), text, size_text, from_field, subj_field, offset_attr, sent_ch, recv_ch,
				strchr( fileName, '/' ), message_id_attr );

		free( from_field );
		free( subj_field );
		free( offset_attr );
		free( sent_ch );
		free( recv_ch );
		free( message_id_attr );
		if ( temp_file_name )
			free( temp_file_name );
		free( temp_full_file_name );
	}
	free( fileLenBuff );
	free( tmpFile );
	free( text );

	return 0;
}
/*
 * Extract the text of fileName and append it to the XML stream with
 * write_doc_toxml().
 *
 * doc_type == 1: the document key is num_CRC32( fileName ).
 * doc_type == 2: fileName belongs to a converted mailbox. Files whose base
 *                name without extension is "date", "index", "subject",
 *                "author" or "attachment" (case-insensitive) are skipped.
 *                If the name of the directory holding fileName contains '-',
 *                the text after the first '-' names the message page
 *                "<TEMP_DIR>/<text>.html" and the key is built from that
 *                page's message id plus "/" plus fileName; otherwise the key
 *                is the message id read from fileName itself.
 * Any other doc_type extracts the text but writes nothing.
 *
 * The text is produced by do_extract_text() into "<fileName>.tmp" and read
 * back with get_text_from_file(); if do_extract_text() returns 1 the
 * document is written with no text.
 *
 * Returns 0 on success or deliberate skip, -1 when stat() fails, an
 * allocation fails, or an attachment's message id cannot be obtained.
 */
int add_doc_to_xml (int xml_fd, char *fileName, int doc_type)
{
	struct stat st;
	int ret = 0;
	int rc = -1;
	char *text = NULL;
	char *tmpFile = NULL;
	char *base_name_without_ext = NULL;
	char *temp_filepath = NULL;
	char *temp_file_name = NULL;
	char *temp_message_id = NULL;
	char *temp_full_file_name = NULL;
	char fileLenBuff[32];
	size_t size_text = 0;

	if ((ret = stat(fileName , &st))!=0)
	{
		fprintf(stderr, "stat failure error .%d", ret);
		return -1;
	}

	/* st_size is an off_t; print it through a type with a matching format */
	snprintf ( fileLenBuff, sizeof fileLenBuff, "%lld", (long long) st.st_size );

	tmpFile = malloc ( CHARSIZE( strlen ( fileName ) + 10) );
	if ( tmpFile == NULL )
		goto out;
	sprintf ( tmpFile, "%s.tmp" , fileName );

	if (do_extract_text ( fileName, tmpFile ) == 1)
	{
		text = NULL;
		size_text = 0;
	}
	else
	{
		text = get_text_from_file( tmpFile, &size_text);
		if ( text != NULL )
			filtering_buff ( text, size_text );
	}

	if ( doc_type == 1 )
	{
		write_doc_toxml ( xml_fd, num_CRC32( fileName ), text, size_text, fileName, fileLenBuff );
	}
	else if ( doc_type == 2 )
	{
		char *do_not_index[] = {"date", "index", "subject", "author", "attachment"};
		int do_not_index_count = PCHARSIZE(do_not_index);
		char *temp_dir_name = NULL;
		char *temp_base_name = NULL;
		char *temp_pos = NULL;
		int i = 0;

		base_name_without_ext = get_file_name_without_ext( fileName );
		if ( base_name_without_ext == NULL )
			goto out;
		for ( i = 0; i < do_not_index_count; i++ )
		{
			if (strcasecmp( base_name_without_ext , do_not_index[i]) == 0)
			{
				/* generated overview page: nothing to index, not an error */
				rc = 0;
				goto out;
			}
		}

		/* dirname() may modify its argument, so work on a private copy */
		temp_filepath = strdup ( fileName );
		if ( temp_filepath == NULL )
			goto out;
		temp_dir_name = dirname( temp_filepath );
		temp_base_name = basename ( temp_dir_name );
		temp_pos = strstr( temp_base_name, "-");

		if ( temp_pos != NULL ) // attachment
		{
			temp_pos++;
			temp_file_name = malloc( CHARSIZE( strlen ( TEMP_DIR ) + strlen ( temp_base_name ) + 20) );
			if ( temp_file_name == NULL )
				goto out;
			sprintf ( temp_file_name, "%s/%s.html", TEMP_DIR, temp_pos);

			temp_message_id = get_message_ID_from_html ( temp_file_name );
			if ( temp_message_id == NULL )
				goto out;
			temp_full_file_name = malloc( CHARSIZE( strlen (temp_message_id) + strlen ( fileName ) + 5 ) );
			if ( temp_full_file_name == NULL )
				goto out;
			sprintf ( temp_full_file_name, "%s/%s", temp_message_id, fileName);
		}
		else // message body
		{
			/* NOTE(review): a NULL id is passed on to num_CRC32() unchanged,
			 * as before — confirm that num_CRC32() accepts NULL */
			temp_full_file_name = get_message_ID_from_html ( fileName );
		}
		write_doc_toxml ( xml_fd, num_CRC32( temp_full_file_name ), text, size_text, fileName, fileLenBuff );
	}
	rc = 0;

out:
	free ( temp_full_file_name );
	free ( temp_message_id );
	free ( temp_file_name );
	free ( temp_filepath );
	free ( base_name_without_ext );
	free ( tmpFile );
	free ( text );
	return rc;
}