Пример #1
0
int add_doc_to_xml ( int xml_fd, char *fileName, Input_Obj_Type tMode )
{
	struct stat st;
	struct tm *t;
	int ret = 0;
	char *xml_ = NULL;
	char *text = NULL;
	char *fileLenBuff = NULL;
	char *tmpFile = NULL;
	char *file_name_in_fs = NULL;
	size_t size_text = 0;

	if ( tMode == zip_obj )
	{
		if ( get_one_file_from_zip ( OBJECT_DEVICE_NAME, fileName ) != 0 )
		{
			return -1;
		}
		file_name_in_fs = malloc( CHARSIZE( strlen ( fileName ) + strlen ( TEMP_DIR ) + 2) );
		sprintf ( file_name_in_fs, "%s/%s", TEMP_DIR, fileName );
	}
	else if ( tMode == mail_obj )
	{
		file_name_in_fs = fileName;
	}

	if ( ( ret = stat( file_name_in_fs, &st ) ) != 0 )
	{
		fprintf( stderr, "stat failure error %d\n", ret );
		return -1;
	}

	fileLenBuff = (char *) malloc( CHARSIZE(50) );
	sprintf( fileLenBuff, "%zu", st.st_size );

	tmpFile = (char *) malloc( CHARSIZE( strlen ( file_name_in_fs ) + 10) );
	sprintf( tmpFile, "%s.tmp", file_name_in_fs );


	if ( do_extract_text( file_name_in_fs, tmpFile ) == 1 )
	{
		text = NULL;
		size_text = 0;
	}
	else
	{
		text = get_text_from_file( tmpFile, &size_text );
		filtering_buff( text, size_text );
	}
	if ( tMode == zip_obj )
	{
		remove( file_name_in_fs );
		free (file_name_in_fs);
		write_doc_toxml( xml_fd, num_CRC32( fileName ), text, size_text, fileName, fileLenBuff );
	}
	else if ( tMode == mail_obj )
	{
		char *base_name_without_ext = NULL;
		char *temp_dir_name = NULL;
		char *temp_base_name = NULL;
		char *temp_file_name = NULL;
		char *temp_pos = NULL;
		char *temp_filepath = NULL;
		char *temp_message_id = NULL;
		char *temp_full_file_name = NULL;
		char *use_file = NULL;
		char *do_not_index[] = { "date", "index", "subject", "author", "attachment" };
		int do_not_index_count = PCHARSIZE(do_not_index);
		int i = 0;
		int skip_indexing = 0;
		base_name_without_ext = get_file_name_without_ext( fileName );
		for( i = 0; i < do_not_index_count; i++ )
		{
			if ( strcasecmp( base_name_without_ext, do_not_index[i] ) == 0 )
			{
				skip_indexing = 1;
				break;
			}
		}
		if ( skip_indexing != 0 )
		{
			free( fileLenBuff );
			free( tmpFile );
			free( text );
			return 0;
		}
		// BUG in glibc. we need to save fileName
		temp_filepath = strdup( fileName );
		temp_dir_name = dirname( temp_filepath );
		temp_base_name = basename( temp_dir_name );
		temp_pos = NULL;
		temp_pos = strstr( temp_base_name, "-" );

		char *from_field = NULL;
		char *subj_field = NULL;
		char *offset_attr = NULL;
		char *sent_ch = NULL;
		char *recv_ch = NULL;
		time_t sent_tm_t;
		time_t recv_tm_t;
		char *message_id_attr = NULL;

		if ( temp_pos != NULL ) // if attachment
		{
			temp_pos++;
			temp_file_name = (char *) malloc( CHARSIZE( strlen ( TEMP_DIR ) + strlen ( temp_base_name ) + 20) );
			sprintf( temp_file_name, "%s/%s.html", TEMP_DIR, temp_pos );
			temp_message_id = get_message_ID_from_html( temp_file_name );
			temp_full_file_name = (char *) malloc( CHARSIZE( strlen (temp_message_id) + strlen ( fileName ) + 5 ) );
			sprintf( temp_full_file_name, "%s/%s", temp_message_id, fileName );
			use_file = temp_file_name;
		}
		else // if message body
		{
			temp_full_file_name = get_message_ID_from_html( fileName );
			use_file = fileName;
		}

		from_field = get_element_from_html( use_file, "email" );
		subj_field = get_element_from_html( use_file, "subject" );
		offset_attr = get_element_from_html( use_file, "offset_email" );
		sent_ch = get_element_from_html( use_file, "isosent" );
		recv_ch = get_element_from_html( use_file, "isoreceived" );
		message_id_attr = get_element_from_html( use_file, "id" );

		sent_tm_t = iso_to_secs( sent_ch );
		recv_tm_t = iso_to_secs( recv_ch );

		sprintf( sent_ch, "%ld", sent_tm_t );
		sprintf( recv_ch, "%ld", recv_tm_t );

		if ( subj_field != NULL )
			filtering_buff( subj_field, strlen ( subj_field ) );
		if ( from_field != NULL )
			filtering_buff( from_field, strlen( from_field ) );


		write_Email_message_to_xml( xml_fd, num_CRC32( temp_full_file_name ), text, size_text, from_field, subj_field, offset_attr, sent_ch, recv_ch,
				strchr( fileName, '/' ), message_id_attr );

		free( from_field );
		free( subj_field );
		free( offset_attr );
		free( sent_ch );
		free( recv_ch );
		free( message_id_attr );
		if ( temp_file_name )
			free( temp_file_name );
		free( temp_full_file_name );
	}
	free( fileLenBuff );
	free( tmpFile );
	free( text );

	return 0;
}
int add_doc_to_xml (int xml_fd, char *fileName, int doc_type)
{
	struct stat st;
	struct tm *t;
	int ret = 0;
	char *xml_ = NULL;
	char *text = NULL;
	char *fileLenBuff = NULL;
	char *tmpFile = NULL;
	size_t size_text = 0;

	if ((ret = stat(fileName , &st))!=0)
    {
		fprintf(stderr, "stat failure error .%d", ret);
		return -1;
    }

	fileLenBuff  = (char *) malloc ( CHARSIZE(50) );
	sprintf ( fileLenBuff, "%zu", st.st_size );

	tmpFile = (char *) malloc ( CHARSIZE( strlen ( fileName ) + 10) );
	sprintf ( tmpFile, "%s.tmp" , fileName );

	if (do_extract_text ( fileName, tmpFile ) == 1)
	{
		text = NULL;
		size_text = 0;
	}
	else
	{
		text = get_text_from_file( tmpFile, &size_text);
		filtering_buff ( text, size_text );
	}

	if ( doc_type == 1 )
		write_doc_toxml ( xml_fd, num_CRC32( fileName ), text, size_text, fileName, fileLenBuff );
	else if ( doc_type ==2 )
	{
		char *base_name_without_ext = NULL;
		char *temp_dir_name = NULL;
		char *temp_base_name = NULL;
		char *temp_file_name = NULL;
		char *temp_pos = NULL;
		char *temp_filepath = NULL;
		char *temp_message_id = NULL;
		char *temp_full_file_name = NULL;
		char *do_not_index[] = {"date", "index", "subject", "author", "attachment"};
		int do_not_index_count = PCHARSIZE(do_not_index);
		int i = 0;
		int skip_indexing = 0;
		base_name_without_ext = get_file_name_without_ext( fileName );
		for ( i = 0; i < do_not_index_count; i++ )
		{
			if (strcasecmp( base_name_without_ext , do_not_index[i]) == 0)
			{
				skip_indexing = 1;
				break;
			}
		}
		if ( skip_indexing != 0 )
		{
			free (fileLenBuff);
			free (tmpFile);
			free (text);
			return 0;
		}
		// BUG in glibc. we need to save fileName
		temp_filepath = strdup ( fileName );
		temp_dir_name = dirname( temp_filepath );
		temp_base_name  = basename ( temp_dir_name );
		temp_pos = NULL;
		temp_pos = strstr( temp_base_name, "-");
		if ( temp_pos != NULL )
		{
			temp_pos++;
			temp_file_name = ( char * ) malloc( CHARSIZE( strlen ( TEMP_DIR ) + strlen ( temp_base_name ) + 20) );
			sprintf ( temp_file_name, "%s/%s.html", TEMP_DIR, temp_pos);
			temp_message_id = get_message_ID_from_html ( temp_file_name );
			temp_full_file_name = (char *) malloc( CHARSIZE( strlen (temp_message_id) + strlen ( fileName ) + 5 ) );
			sprintf ( temp_full_file_name, "%s/%s", temp_message_id, fileName);
			free ( temp_file_name );
		}
		else
		{
			temp_full_file_name = get_message_ID_from_html ( fileName );
		}
		write_doc_toxml ( xml_fd, num_CRC32( temp_full_file_name ), text, size_text, fileName, fileLenBuff );
		free ( temp_full_file_name );
	}
	free (fileLenBuff);
	free (tmpFile);
	free (text);

	return 0;
}