/*
 * Build an ext4 filesystem image from the global `info` settings and write it
 * to `fd`.
 *
 * Unset fields of `info` (length, block size, journal blocks, blocks per
 * group, inode count, inode size, label) are filled in with computed or
 * default values before the image is laid out.
 *
 * fd             - descriptor the image is written to; also used to query the
 *                  size when info.len is not set
 * directory      - source tree to populate the image from, or NULL to build
 *                  the default directory structure
 * mountpoint     - passed to MTK_add_mountpoint() and
 *                  build_directory_structure(); when `sehnd` is set (and
 *                  HAVE_SELINUX is defined) it is also the path used to look
 *                  up the root inode's security context
 * fs_config_func - forwarded to build_directory_structure()
 * gzip, sparse, crc - forwarded to write_ext4_image()
 * wipe           - non-zero to call wipe_block_device() before writing
 * init_itabs     - non-zero to call init_unused_inode_tables()
 * sehnd          - SELinux label handle, or NULL to skip labeling
 *
 * Returns 0 on success, or EXIT_FAILURE when the filesystem size cannot be
 * determined or when control comes back through longjmp(setjmp_env).
 */
int make_ext4fs_internal(int fd, const char *directory,
                         char *mountpoint, fs_config_func_t fs_config_func, int gzip, int sparse,
                         int crc, int wipe, int init_itabs, struct selabel_handle *sehnd)
{
	u32 root_inode_num;
	u16 root_mode;

	if (setjmp(setjmp_env))
		return EXIT_FAILURE; /* Handle a call to longjmp() */

	/* No explicit length configured: ask the target for its size. */
	if (info.len <= 0)
		info.len = get_file_size(fd);

	if (info.len <= 0) {
		fprintf(stderr, "Need size of filesystem\n");
		return EXIT_FAILURE;
	}

	if (info.block_size <= 0)
		info.block_size = compute_block_size();

	/* Round down the filesystem length to be a multiple of the block size */
	info.len &= ~((u64)info.block_size - 1);

	if (info.journal_blocks == 0)
		info.journal_blocks = compute_journal_blocks();

	/* Note: this assignment replaces any previously set compat flags. */
	if (info.no_journal == 0)
		info.feat_compat = EXT4_FEATURE_COMPAT_HAS_JOURNAL;
	else
		info.journal_blocks = 0;

	if (info.blocks_per_group <= 0)
		info.blocks_per_group = compute_blocks_per_group();

	if (info.inodes <= 0)
		info.inodes = compute_inodes();

	if (info.inode_size <= 0)
		info.inode_size = 256;

	if (info.label == NULL)
		info.label = "";

	info.inodes_per_group = compute_inodes_per_group();

	info.feat_compat |=
			EXT4_FEATURE_COMPAT_RESIZE_INODE;

	info.feat_ro_compat |=
			EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER |
			EXT4_FEATURE_RO_COMPAT_LARGE_FILE;

	info.feat_incompat |=
			EXT4_FEATURE_INCOMPAT_EXTENTS |
			EXT4_FEATURE_INCOMPAT_FILETYPE;


	info.bg_desc_reserve_blocks = compute_bg_desc_reserve_blocks();

	printf("Creating filesystem with parameters:\n");
	printf("    Size: %llu\n", info.len);
	printf("    Block size: %d\n", info.block_size);
	printf("    Blocks per group: %d\n", info.blocks_per_group);
	printf("    Inodes per group: %d\n", info.inodes_per_group);
	printf("    Inode size: %d\n", info.inode_size);
	printf("    Journal blocks: %d\n", info.journal_blocks);
	printf("    Label: %s\n", info.label);

	ext4_create_fs_aux_info();

	printf("    Blocks: %llu\n", aux_info.len_blocks);
	printf("    Block groups: %d\n", aux_info.groups);
	printf("    Reserved block group size: %d\n", info.bg_desc_reserve_blocks);

	info.sparse_file = sparse_file_new(info.block_size, info.len);

	block_allocator_init();

	ext4_fill_in_sb();
	MTK_add_mountpoint(aux_info.sb,mountpoint);

	if (reserve_inodes(0, 10) == EXT4_ALLOCATE_FAILED)
		error("failed to reserve first 10 inodes");

	if (info.feat_compat & EXT4_FEATURE_COMPAT_HAS_JOURNAL)
		ext4_create_journal_inode();

	if (info.feat_compat & EXT4_FEATURE_COMPAT_RESIZE_INODE)
		ext4_create_resize_inode();

#ifdef USE_MINGW
	// Windows needs only 'create an empty fs image' functionality
	assert(!directory);
	root_inode_num = build_default_directory_structure();
#else
	if (directory)
		root_inode_num = build_directory_structure(directory, mountpoint, 0,
                        fs_config_func, sehnd);
	else
		root_inode_num = build_default_directory_structure();
#endif

	/* Root directory is rwxr-xr-x; the remaining arguments are passed as 0. */
	root_mode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
	inode_set_permissions(root_inode_num, root_mode, 0, 0, 0);

#ifdef HAVE_SELINUX
	if (sehnd) {
		char *sepath = NULL;
		char *secontext = NULL;

		/* The lookup path must be absolute: prefix '/' when it is missing. */
		if (mountpoint[0] == '/') {
			sepath = strdup(mountpoint);
		} else if (asprintf(&sepath, "/%s", mountpoint) < 0) {
			/*
			 * asprintf() leaves the output pointer undefined on failure,
			 * so reset it to make the NULL check below reliable.
			 */
			sepath = NULL;
		}
		if (!sepath)
			critical_error_errno("malloc");
		if (selabel_lookup(sehnd, &secontext, sepath, S_IFDIR) < 0) {
			error("cannot lookup security context for %s", sepath);
		}
		if (secontext) {
			printf("Labeling %s as %s\n", sepath, secontext);
			inode_set_selinux(root_inode_num, secontext);
		}
		free(sepath);
		freecon(secontext);
	}
#endif

	ext4_update_free();

	if (init_itabs)
		init_unused_inode_tables();

	ext4_queue_sb();

	/* Used counts are derived as total minus free from the superblock. */
	printf("Created filesystem with %d/%d inodes and %d/%d blocks\n",
			aux_info.sb->s_inodes_count - aux_info.sb->s_free_inodes_count,
			aux_info.sb->s_inodes_count,
			aux_info.sb->s_blocks_count_lo - aux_info.sb->s_free_blocks_count_lo,
			aux_info.sb->s_blocks_count_lo);

	if (wipe)
		wipe_block_device(fd, info.len);

	write_ext4_image(fd, gzip, sparse, crc);

	sparse_file_destroy(info.sparse_file);
	info.sparse_file = NULL;

	return 0;
}
Exemple #2
0
/*
 * Build an ext4 filesystem image from the contents of `_directory` and write
 * it to `fd`, using the global `info` settings.
 *
 * Unset fields of `info` (length, block size, journal blocks, blocks per
 * group, inode count, inode size, label) are filled in with computed or
 * default values before the image is laid out.
 *
 * fd              - descriptor the image is written to; also used to query
 *                   the size when info.len is not set
 * _directory      - source tree to populate the image from; must not be NULL
 * fs_config_func  - forwarded to build_directory_structure()
 * gzip, sparse, crc - forwarded to write_ext4_image()
 * wipe            - non-zero to call wipe_block_device() (only when
 *                   WIPE_IS_SUPPORTED is true)
 * verbose, fixed_time - forwarded to build_directory_structure()
 * block_list_file - when non-NULL, receives one entry per saved block
 *                   allocation: the file name (with the leading source
 *                   directory stripped when it matches) followed by the
 *                   output of print_blocks()
 *
 * Returns 0 on success, or EXIT_FAILURE when `_directory` is NULL, the
 * filesystem size cannot be determined, or control comes back through
 * longjmp(setjmp_env).
 */
int make_ext4fs_internal(int fd, const char *_directory,
						 fs_config_func_t fs_config_func, int gzip,
						 int sparse, int crc, int wipe,
						 int verbose, time_t fixed_time,
						 FILE* block_list_file)
{
	u32 root_inode_num;
	u16 root_mode;
	char *directory = NULL;

	/* The longjmp() return path does not release `directory`. */
	if (setjmp(setjmp_env))
		return EXIT_FAILURE; /* Handle a call to longjmp() */

	if (_directory == NULL) {
		fprintf(stderr, "Need a source directory\n");
		return EXIT_FAILURE;
	}

	/* Working copy of the source path; released with free() before returning. */
	directory = canonicalize_rel_slashes(_directory);

	if (info.len <= 0)
		info.len = get_file_size(fd);

	if (info.len <= 0) {
		fprintf(stderr, "Need size of filesystem\n");
		/* Do not leak the canonicalised path on this early exit. */
		free(directory);
		return EXIT_FAILURE;
	}

	if (info.block_size <= 0)
		info.block_size = compute_block_size();

	/* Round down the filesystem length to be a multiple of the block size */
	info.len &= ~((u64)info.block_size - 1);

	if (info.journal_blocks == 0)
		info.journal_blocks = compute_journal_blocks();

	/* Note: this assignment replaces any previously set compat flags. */
	if (info.no_journal == 0)
		info.feat_compat = EXT4_FEATURE_COMPAT_HAS_JOURNAL;
	else
		info.journal_blocks = 0;

	if (info.blocks_per_group <= 0)
		info.blocks_per_group = compute_blocks_per_group();

	if (info.inodes <= 0)
		info.inodes = compute_inodes();

	if (info.inode_size <= 0)
		info.inode_size = 256;

	if (info.label == NULL)
		info.label = "";

	info.inodes_per_group = compute_inodes_per_group();

	info.feat_compat |=
			EXT4_FEATURE_COMPAT_RESIZE_INODE |
			EXT4_FEATURE_COMPAT_EXT_ATTR;

	info.feat_ro_compat |=
			EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER |
			EXT4_FEATURE_RO_COMPAT_LARGE_FILE |
			EXT4_FEATURE_RO_COMPAT_GDT_CSUM;

	info.feat_incompat |=
			EXT4_FEATURE_INCOMPAT_EXTENTS |
			EXT4_FEATURE_INCOMPAT_FILETYPE;


	info.bg_desc_reserve_blocks = compute_bg_desc_reserve_blocks();

	printf("Creating filesystem with parameters:\n");
	printf("    Size: %"PRIu64"\n", info.len);
	printf("    Block size: %d\n", info.block_size);
	printf("    Blocks per group: %d\n", info.blocks_per_group);
	printf("    Inodes per group: %d\n", info.inodes_per_group);
	printf("    Inode size: %d\n", info.inode_size);
	printf("    Journal blocks: %d\n", info.journal_blocks);
	printf("    Label: %s\n", info.label);

	ext4_create_fs_aux_info();

	printf("    Blocks: %"PRIu64"\n", aux_info.len_blocks);
	printf("    Block groups: %d\n", aux_info.groups);
	printf("    Reserved blocks: %"PRIu64"\n",  (aux_info.len_blocks / 100) * info.reserve_pcnt);
	printf("    Reserved block group size: %d\n", info.bg_desc_reserve_blocks);

	ext4_sparse_file = sparse_file_new(info.block_size, info.len);

	block_allocator_init();

	ext4_fill_in_sb();

	if (reserve_inodes(0, 10) == EXT4_ALLOCATE_FAILED)
		error("failed to reserve first 10 inodes");

	if (info.feat_compat & EXT4_FEATURE_COMPAT_HAS_JOURNAL)
		ext4_create_journal_inode();

	if (info.feat_compat & EXT4_FEATURE_COMPAT_RESIZE_INODE)
		ext4_create_resize_inode();

	root_inode_num = build_directory_structure(directory, "", 0,
		fs_config_func, verbose, fixed_time);

	/* Root directory is rwxr-xr-x; the remaining arguments are passed as 0. */
	root_mode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
	inode_set_permissions(root_inode_num, root_mode, 0, 0, 0);

	ext4_update_free();

	ext4_queue_sb();

	if (block_list_file) {
		size_t dirlen = strlen(directory);
		struct block_allocation* p = get_saved_allocation_chain();
		/* Walk the chain, emitting and then freeing each entry. */
		while (p) {
			if (strncmp(p->filename, directory, dirlen) == 0) {
				/* Strip the source-directory prefix from the reported name. */
				fprintf(block_list_file, "%s", p->filename + dirlen);
			} else {
				fprintf(block_list_file, "%s", p->filename);
			}
			print_blocks(block_list_file, p);
			struct block_allocation* pn = p->next;
			free_alloc(p);
			p = pn;
		}
	}

	/* Used counts are derived as total minus free from the superblock. */
	printf("Created filesystem with %d/%d inodes and %d/%d blocks\n",
			aux_info.sb->s_inodes_count - aux_info.sb->s_free_inodes_count,
			aux_info.sb->s_inodes_count,
			aux_info.sb->s_blocks_count_lo - aux_info.sb->s_free_blocks_count_lo,
			aux_info.sb->s_blocks_count_lo);

	if (wipe && WIPE_IS_SUPPORTED) {
		wipe_block_device(fd, info.len);
	}

	write_ext4_image(fd, gzip, sparse, crc);

	sparse_file_destroy(ext4_sparse_file);
	ext4_sparse_file = NULL;

	free(directory);

	return 0;
}
Exemple #3
0
/*
 * Create an ext4 filesystem image in `filename` using the global `info`
 * settings, filling in computed or default values for any that are unset.
 *
 * filename   - target passed to get_file_size() (when info.len is 0) and to
 *              write_ext4_image()
 * directory  - source tree to populate the image from, or NULL to build the
 *              default directory structure
 * mountpoint - forwarded to build_directory_structure()
 * android    - forwarded to build_directory_structure()
 * gzip, sparse - forwarded to write_ext4_image()
 *
 * Returns 0 on success, or EXIT_FAILURE when no filesystem size is available.
 */
int make_ext4fs(const char *filename, const char *directory,
                char *mountpoint, int android, int gzip, int sparse)
{
	/* Root directory permissions: rwxr-xr-x. */
	const u16 root_perms = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
	u32 root_ino;

	/* ---- Resolve filesystem geometry -------------------------------- */

	if (info.len == 0) {
		info.len = get_file_size(filename);
	}
	if (info.len <= 0) {
		fprintf(stderr, "Need size of filesystem\n");
		return EXIT_FAILURE;
	}

	if (info.block_size <= 0) {
		info.block_size = compute_block_size();
	}

	if (info.journal_blocks == 0) {
		info.journal_blocks = compute_journal_blocks();
	}

	/* Without a journal the block count is cleared; with one, the compat
	 * flags are reset to just HAS_JOURNAL. */
	if (info.no_journal != 0) {
		info.journal_blocks = 0;
	} else {
		info.feat_compat = EXT4_FEATURE_COMPAT_HAS_JOURNAL;
	}

	if (info.blocks_per_group <= 0) {
		info.blocks_per_group = compute_blocks_per_group();
	}
	if (info.inodes <= 0) {
		info.inodes = compute_inodes();
	}
	if (info.inode_size <= 0) {
		info.inode_size = 256;
	}
	if (info.label == NULL) {
		info.label = "";
	}

	info.inodes_per_group = compute_inodes_per_group();

	/* ---- Feature flags that are always enabled ---------------------- */

	info.feat_compat |= EXT4_FEATURE_COMPAT_RESIZE_INODE;
	info.feat_ro_compat |= EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER |
			EXT4_FEATURE_RO_COMPAT_LARGE_FILE;
	info.feat_incompat |= EXT4_FEATURE_INCOMPAT_EXTENTS |
			EXT4_FEATURE_INCOMPAT_FILETYPE;

	/* ---- Report the chosen parameters ------------------------------- */

	printf("Creating filesystem with parameters:\n");
	printf("    Size: %llu\n", info.len);
	printf("    Block size: %d\n", info.block_size);
	printf("    Blocks per group: %d\n", info.blocks_per_group);
	printf("    Inodes per group: %d\n", info.inodes_per_group);
	printf("    Inode size: %d\n", info.inode_size);
	printf("    Journal blocks: %d\n", info.journal_blocks);
	printf("    Label: %s\n", info.label);

	ext4_create_fs_aux_info();

	printf("    Blocks: %llu\n", aux_info.len_blocks);
	printf("    Block groups: %d\n", aux_info.groups);
	printf("    Reserved block group size: %d\n", aux_info.bg_desc_reserve_blocks);

	/* ---- Lay out the filesystem ------------------------------------- */

	block_allocator_init();
	ext4_fill_in_sb();

	if (reserve_inodes(0, 10) == EXT4_ALLOCATE_FAILED) {
		error("failed to reserve first 10 inodes");
	}
	if (info.feat_compat & EXT4_FEATURE_COMPAT_HAS_JOURNAL) {
		ext4_create_journal_inode();
	}
	if (info.feat_compat & EXT4_FEATURE_COMPAT_RESIZE_INODE) {
		ext4_create_resize_inode();
	}

	/* Populate from the source tree when one was given. */
	root_ino = directory
		? build_directory_structure(directory, mountpoint, 0, android)
		: build_default_directory_structure();

	inode_set_permissions(root_ino, root_perms, 0, 0, 0);

	ext4_update_free();

	/* Used counts are derived as total minus free from the superblock. */
	printf("Created filesystem with %d/%d inodes and %d/%d blocks\n",
	       aux_info.sb->s_inodes_count - aux_info.sb->s_free_inodes_count,
	       aux_info.sb->s_inodes_count,
	       aux_info.sb->s_blocks_count_lo - aux_info.sb->s_free_blocks_count_lo,
	       aux_info.sb->s_blocks_count_lo);

	write_ext4_image(filename, gzip, sparse);

	return 0;
}
Exemple #4
0
  /**
   * The main host function called from outside, as part of the API for a single node.
   *
   * Allocates the output matrix `ff` (nqx * nqy * nqz elements, zero-filled),
   * chooses a hyperblock size, and runs the form-factor kernel once per
   * hyperblock over the four dimensions (x, y, z, triangles).
   *
   * @param rank        process rank; only rank 0 prints progress messages
   * @param shape_def   flat triangle data; the triangle count is
   *                    shape_def.size() / CPU_T_PROP_SIZE_
   * @param ff          [out] set to a newly allocated array of
   *                    nqx * nqy * nqz elements; set to NULL if a later
   *                    allocation fails and the array is released again
   * @param qx,qy,qz    q-point arrays forwarded to the kernels
   * @param nqx,nqy,nqz extents of the q-grid along each axis
   * @param rot         forwarded to the fused kernels
   * @param kernel_time [out] elapsed kernel time in milliseconds
   * @param red_time    not written by this function
   * @param mem_time    not written by this function
   * @return the number of triangles processed, or 0 when there are no
   *         triangles or a memory allocation fails
   */
  unsigned int NumericFormFactorC::compute_form_factor(int rank,
//            #ifndef __SSE3__
              real_vec_t &shape_def,
//            #else
//              real_t* shape_def, unsigned int num_triangles,
//            #endif
            complex_t* &ff,
            real_t* &qx, int nqx, real_t* &qy, int nqy, complex_t* &qz, int nqz,
            real_t* &rot,
            real_t& kernel_time, real_t& red_time, real_t& mem_time
            #ifdef FINDBLOCK
              , const int block_x, const int block_y, const int block_z, const int block_t
            #endif
            ) {
    #ifdef _OPENMP
      if(rank == 0)
        std::cout << "++      Number of OpenMP threads: " << omp_get_max_threads() << std::endl;
    #endif
  
//    #ifndef __SSE3__
      unsigned int num_triangles = shape_def.size() / CPU_T_PROP_SIZE_;
//    #endif
    if(num_triangles < 1) return 0;

//    #ifdef INTEL_SB_AVX
//      unsigned int shape_padding = (32 - (num_triangles & 31)) & 31;
//    #elif defined __SSE3__
//      unsigned int shape_padding = (16 - (num_triangles & 15)) & 15;
//    #endif
  
    //#ifndef FF_NUM_CPU_PADDING
      // widen before multiplying so the product is not computed in (overflow-prone) int
      unsigned long int total_qpoints = (unsigned long int) nqx * nqy * nqz;
      unsigned long int host_mem_usage = ((unsigned long int) nqx + nqy) * sizeof(real_t) +
                        nqz * sizeof(complex_t);
    //#else
      // padding to 16 bytes
      //const unsigned int PAD_LINE_ = 16;
      //unsigned int pad_x = 0;
      //if(nqx != 1) pad_x = (PAD_LINE_ - (nqx % PAD_LINE_)) % PAD_LINE_;
      //unsigned int pad_y = (PAD_LINE_ - (nqy % PAD_LINE_)) % PAD_LINE_;
      //unsigned int pad_z = (PAD_LINE_ - (nqz % PAD_LINE_)) % PAD_LINE_;
      //unsigned int pnqx = nqx + pad_x, pnqy = nqy + pad_y, pnqz = nqz + pad_z;
      //unsigned long int total_qpoints = pnqx * pnqy * pnqz;
      //unsigned long int host_mem_usage = ((unsigned long int) pnqx + pnqy) * sizeof(real_t) +
      //                  pnqz * sizeof(complex_t);
    //#endif
  
    // allocate memory for the final FF 3D matrix
    ff = new (std::nothrow) complex_t[total_qpoints];
    if(ff == NULL) {
      std::cerr << "Memory allocation failed for ff. Size = "
            << total_qpoints * sizeof(complex_t) << " b" << std::endl;
      return 0;
    } // if
    // initialize to 0 -- only after the allocation is known to have succeeded
    memset(ff, 0, total_qpoints * sizeof(complex_t));
    host_mem_usage += total_qpoints * sizeof(complex_t);
  
    //unsigned long int matrix_size = (unsigned long int) nqx * nqy * nqz * num_triangles;
    
    // do hyperblocking to use less memory
    unsigned int b_nqx = 0, b_nqy = 0, b_nqz = 0, b_num_triangles = 0;
    #ifndef FF_NUM_CPU_AUTOTUNE_HB
      compute_block_size(nqx, nqy, nqz, num_triangles,
                b_nqx, b_nqy, b_nqz, b_num_triangles
                #ifdef FINDBLOCK
                  , block_x, block_y, block_z, block_t
                #endif
                );
    #else
      // brute-force search: time one kernel call per candidate block shape and
      // keep the shape with the smallest estimated total (time x number of blocks)
      std::cout << "-- Autotuning hyperblock size ... " << std::endl;
      double min_time_hb = 1000000.0;
      unsigned int min_b_nqx = 1, min_b_nqy = 1, min_b_nqz = 1, min_b_num_triangles = 1;
      woo::BoostChronoTimer at_kernel_timer, at_overhead_timer;
      at_overhead_timer.start();
      // NOTE(review): ff_temp is allocated and released but never passed to the
      // kernel below, which writes into ff instead -- confirm whether the trial
      // runs were meant to use ff_temp as scratch output.
      complex_t* ff_temp;
      ff_temp = new (std::nothrow) complex_t[nqx * nqy * nqz];
      for(int b_nqx_i = 1; b_nqx_i <= nqx; ++ b_nqx_i) {
        for(int b_nqy_i = 10; b_nqy_i <= nqy; b_nqy_i += 10) {
          for(int b_nqz_i = 10; b_nqz_i <= nqz; b_nqz_i += 10) {
            for(int b_nt_i = 10; b_nt_i <= num_triangles; b_nt_i += 10) {
              at_kernel_timer.start();

              // compute the number of sub-blocks, along each of the 4 dimensions
              unsigned int nb_x = (unsigned int) ceil((float) nqx / b_nqx_i);
              unsigned int nb_y = (unsigned int) ceil((float) nqy / b_nqy_i);
              unsigned int nb_z = (unsigned int) ceil((float) nqz / b_nqz_i);
              unsigned int nb_t = (unsigned int) ceil((float) num_triangles / b_nt_i);
              unsigned int num_blocks = nb_x * nb_y * nb_z * nb_t;

              form_factor_kernel_fused_nqx1(qx, qy, qz, shape_def,
                  b_nqx_i, b_nqy_i, b_nqz_i, b_nt_i,
                  b_nqx_i, b_nqy_i, b_nqz_i, b_nt_i,
                  nqx, nqy, nqz, num_triangles,
                  0, 0, 0, 0,
                  rot,
                  ff);

              at_kernel_timer.stop();
              double curr_time = at_kernel_timer.elapsed_msec();
              double tot_time = curr_time * num_blocks;
              std::cout << "## " << b_nqx_i << " x " << b_nqy_i << " x " << b_nqz_i
                    << " x " << b_nt_i << "\t" << num_blocks << "\t:\t"
                    << curr_time << "\t" << tot_time << std::endl;
              if(tot_time < min_time_hb) {
                min_time_hb = tot_time;
                min_b_nqx = b_nqx_i; min_b_nqy = b_nqy_i; min_b_nqz = b_nqz_i;
                min_b_num_triangles = b_nt_i;
              } // if
            } // for
          } // for
        } // for
      } // for
      delete[] ff_temp;
      at_overhead_timer.stop();

      b_nqx = min_b_nqx; b_nqy = min_b_nqy; b_nqz = min_b_nqz; b_num_triangles = min_b_num_triangles;
      if(rank == 0) {
        std::cout << "##    HBlock Autotuner overhead: " << at_overhead_timer.elapsed_msec()
              << " ms." << std::endl;
      } // if
    #endif
  
    unsigned long int blocked_3d_matrix_size = (unsigned long int) b_nqx * b_nqy * b_nqz;
    
    //size_t estimated_host_mem_need = host_mem_usage + blocked_matrix_size * sizeof(complex_t);
    //if(rank == 0) {
    //  std::cout << "++    Estimated host memory need: " << (float) estimated_host_mem_need / 1024 / 1024
    //        << " MB" << std::endl;
    //} // if
    #ifndef FF_NUM_CPU_FUSED
      // the non-fused path needs an intermediate buffer holding one hyperblock
      unsigned long int blocked_matrix_size =
                    (unsigned long int) blocked_3d_matrix_size * b_num_triangles;
      host_mem_usage += blocked_matrix_size * sizeof(complex_t);
      complex_t *fq_buffer = new (std::nothrow) complex_t[blocked_matrix_size]();
      if(fq_buffer == NULL) {
        std::cerr << "Memory allocation failed for fq_buffer. blocked_matrix_size = "
              << blocked_matrix_size << std::endl
              << "Host memory usage = " << (float) host_mem_usage / 1024 / 1024 << " MB"
              << std::endl;
        delete[] ff;
        ff = NULL;  // ff is a reference: do not hand a dangling pointer back to the caller
        return 0;
      } // if
    #endif
    if(rank == 0) {
      std::cout << "++             Host memory usage: " << (float) host_mem_usage / 1024 / 1024
            << " MB" << std::endl << std::flush;
    } // if

    // compute the number of sub-blocks, along each of the 4 dimensions
    // formulate loops over each dimension, to go over each sub block
    unsigned int nb_x = (unsigned int) ceil((float) nqx / b_nqx);
    unsigned int nb_y = (unsigned int) ceil((float) nqy / b_nqy);
    unsigned int nb_z = (unsigned int) ceil((float) nqz / b_nqz);
    unsigned int nb_t = (unsigned int) ceil((float) num_triangles / b_num_triangles);

    unsigned int curr_b_nqx = b_nqx, curr_b_nqy = b_nqy, curr_b_nqz = b_nqz;
    unsigned int curr_b_num_triangles = b_num_triangles;
    unsigned int num_blocks = nb_x * nb_y * nb_z * nb_t;

    #ifdef TIME_DETAIL_2
      if(rank == 0) {
        std::cout << "++               Hyperblock size: " << b_nqx << " x " << b_nqy
              << " x " << b_nqz << " x " << b_num_triangles << std::endl;
        std::cout << "++  Number of decomposed Hblocks: " << num_blocks
              << " [" << nb_x << " x " << nb_y << " x " << nb_z << " x " << nb_t << "]"
              << std::endl;
      } // if
    #endif // TIME_DETAIL_2

    #ifdef PROFILE_PAPI
      long long int papi_total_cycles = 0, papi_total_inst = 0, papi_total_flop = 0;
    #endif

    if(rank == 0) std::cout << "-- Computing form factor on CPU ... " << std::flush;

    woo::BoostChronoTimer kernel_timer;
    kernel_timer.start();

    // compute for each hyperblock
    // (the last block along each dimension may be smaller than the nominal size)
    curr_b_nqx = b_nqx;
    for(unsigned int ib_x = 0; ib_x < nb_x; ++ ib_x) {
      if(ib_x == nb_x - 1) curr_b_nqx = nqx - b_nqx * ib_x;
      curr_b_nqy = b_nqy;
      for(unsigned int ib_y = 0; ib_y < nb_y; ++ ib_y) {
        if(ib_y == nb_y - 1) curr_b_nqy = nqy - b_nqy * ib_y;
        curr_b_nqz = b_nqz;
        for(unsigned int ib_z = 0; ib_z < nb_z; ++ ib_z) {
          if(ib_z == nb_z - 1) curr_b_nqz = nqz - b_nqz * ib_z;
          curr_b_num_triangles = b_num_triangles;
          for(unsigned int ib_t = 0; ib_t < nb_t; ++ ib_t) {
            if(ib_t == nb_t - 1)
              curr_b_num_triangles = num_triangles - b_num_triangles * ib_t;

            #ifdef PROFILE_PAPI
              // PAPI_L1_DCM  0x80000000  No   Level 1 data cache misses
              // PAPI_L1_ICM  0x80000001  No   Level 1 instruction cache misses
              // PAPI_L2_DCM  0x80000002  No   Level 2 data cache misses
              // PAPI_L2_ICM  0x80000003  No   Level 2 instruction cache misses
              // PAPI_L1_TCM  0x80000006  Yes  Level 1 cache misses
              // PAPI_L2_TCM  0x80000007  No   Level 2 cache misses
              // PAPI_FPU_IDL 0x80000012  No   Cycles floating point units are idle
              // PAPI_TLB_DM  0x80000014  No   Data translation lookaside buffer misses
              // PAPI_TLB_IM  0x80000015  No   Instruction translation lookaside buffer misses
              // PAPI_TLB_TL  0x80000016  Yes  Total translation lookaside buffer misses
              // PAPI_STL_ICY 0x80000025  No   Cycles with no instruction issue
              // PAPI_HW_INT  0x80000029  No   Hardware interrupts
              // PAPI_BR_TKN  0x8000002c  No   Conditional branch instructions taken
              // PAPI_BR_MSP  0x8000002e  No   Conditional branch instructions mispredicted
              // PAPI_TOT_INS 0x80000032  No   Instructions completed
              // PAPI_FP_INS  0x80000034  No   Floating point instructions
              // PAPI_BR_INS  0x80000037  No   Branch instructions
              // PAPI_VEC_INS 0x80000038  No   Vector/SIMD instructions (could include integer)
              // PAPI_RES_STL 0x80000039  No   Cycles stalled on any resource
              // PAPI_TOT_CYC 0x8000003b  No   Total cycles
              // PAPI_L1_DCH  0x8000003e  Yes  Level 1 data cache hits
              // PAPI_L2_DCH  0x8000003f  Yes  Level 2 data cache hits
              // PAPI_L1_DCA  0x80000040  No   Level 1 data cache accesses
              // PAPI_L2_DCA  0x80000041  No   Level 2 data cache accesses
              // PAPI_L1_ICH  0x80000049  Yes  Level 1 instruction cache hits
              // PAPI_L2_ICH  0x8000004a  No   Level 2 instruction cache hits
              // PAPI_L1_ICA  0x8000004c  No   Level 1 instruction cache accesses
              // PAPI_L2_ICA  0x8000004d  No   Level 2 instruction cache accesses
              // PAPI_L1_ICR  0x8000004f  No   Level 1 instruction cache reads
              // PAPI_L1_TCH  0x80000055  Yes  Level 1 total cache hits
              // PAPI_L2_TCH  0x80000056  Yes  Level 2 total cache hits
              // PAPI_L1_TCA  0x80000058  Yes  Level 1 total cache accesses
              // PAPI_L2_TCA  0x80000059  No   Level 2 total cache accesses
              // PAPI_FML_INS 0x80000061  No   Floating point multiply instructions
              // PAPI_FAD_INS 0x80000062  No   Floating point add instructions
              //                               (Also includes subtract instructions)
              // PAPI_FDV_INS 0x80000063  No   Floating point divide instructions
              //                               (Counts both divide and square root instructions)
              // PAPI_FSQ_INS 0x80000064  No   Floating point square root instructions
              //                               (Counts both divide and square root instructions)
              // PAPI_FP_OPS  0x80000066  No   Floating point operations
              // PAPI_SP_OPS  0x80000067  No   Floating point operations; optimized to count
              //                               scaled single precision vector operations
              // PAPI_DP_OPS  0x80000068  No   Floating point operations; optimized to count
              //                               scaled double precision vector operations

              int papi_events[3] = { PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_FP_OPS };
              //int papi_events[3] = { PAPI_FML_INS, PAPI_FAD_INS, PAPI_FDV_INS };
              //int papi_events[3] = { PAPI_FP_OPS, PAPI_SP_OPS, PAPI_DP_OPS };
              long long  papi_counter_values[3];
              PAPI_start_counters(papi_events, 3);
            #endif

            // call the main kernel
            #ifndef FF_NUM_CPU_FUSED // DO NOT USE THIS
              form_factor_kernel(qx, qy, qz, shape_def,
                  curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles,
                  b_nqx, b_nqy, b_nqz, b_num_triangles,
                  ib_x, ib_y, ib_z, ib_t,
                  fq_buffer);
            #else
              if(nqx == 1) {
                form_factor_kernel_fused_nqx1(qx, qy, qz, shape_def,
                //form_factor_kernel_fused_nqx1_unroll4(qx, qy, qz, shape_def,
                    curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles,
                    b_nqx, b_nqy, b_nqz, b_num_triangles,
                    nqx, nqy, nqz, num_triangles,
                    ib_x, ib_y, ib_z, ib_t,
                    rot,
                    ff);
              } else {
//                #ifdef __SSE3__
//                  if(rank == 0)
//                    std::cout << "uh-oh: no SSE3 version!" << std::endl;
//                #else
                  form_factor_kernel_fused_unroll4(qx, qy, qz, shape_def,
                    curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles,
                    b_nqx, b_nqy, b_nqz, b_num_triangles,
                    nqx, nqy, nqz, num_triangles,
                    ib_x, ib_y, ib_z, ib_t,
                    rot,
                    ff);
//                #endif // __SSE3__
              } // if-else
            #endif

            #ifndef FF_NUM_CPU_FUSED // DO NOT USE THIS
              // call the reduction kernel
              reduction_kernel(curr_b_nqx, curr_b_nqy, curr_b_nqz,
                  curr_b_num_triangles, blocked_matrix_size,
                  b_nqx, b_nqy, b_nqz, num_triangles,
                  nqx, nqy, nqz,
                  ib_x, ib_y, ib_z, ib_t,
                  fq_buffer, ff);
            #endif

            #ifdef PROFILE_PAPI
              PAPI_stop_counters(papi_counter_values, 3);
              papi_total_cycles += papi_counter_values[0];
              papi_total_inst += papi_counter_values[1];
              papi_total_flop += papi_counter_values[2];
            #endif
          } // for ib_t
        } // for ib_z
      } // for ib_y
    } // for ib_x

    kernel_timer.stop();
    kernel_time = kernel_timer.elapsed_msec();

    #ifndef FF_NUM_CPU_FUSED
      delete[] fq_buffer;
    #endif

    if(rank == 0) std::cout << "done." << std::endl;

    #ifdef PROFILE_PAPI
      if(rank == 0) {
        std::cout << "++                  PAPI_TOT_CYC: " << papi_total_cycles << std::endl;
        std::cout << "++                  PAPI_TOT_INS: " << papi_total_inst << std::endl;
        std::cout << "++                   PAPI_FP_OPS: " << papi_total_flop << std::endl;
        std::cout << "++                           IPC: "
              << (double) papi_total_inst / papi_total_cycles << std::endl;
      } // if
    #endif

    return num_triangles;
  } // NumericFormFactorC::compute_form_factor()
/*
 * Build an ext4 filesystem image and write it to `fd`, using the global
 * `info` settings.
 *
 * Unset fields of `info` (length, block size, journal blocks, blocks per
 * group, inode count, inode size, label) are filled in with computed or
 * default values before the image is laid out.
 *
 * fd                    - descriptor the image is written to; also used to
 *                         query the size when info.len is not set
 * _directory            - source tree to populate the image from, or NULL to
 *                         build the default directory structure
 * _target_out_directory - optional path forwarded (after canonicalisation)
 *                         to build_directory_structure()
 * _mountpoint           - mount point of the image; NULL is treated as ""
 * fs_config_func        - forwarded to build_directory_structure()
 * gzip, sparse, crc     - forwarded to write_ext4_image(); none of them may
 *                         be set when `fd` is a block device
 * wipe                  - non-zero to call wipe_block_device() (only when
 *                         WIPE_IS_SUPPORTED is true)
 * real_uuid             - forwarded to ext4_fill_in_sb()
 * sehnd                 - SELinux label handle, or NULL to skip labeling
 * verbose               - non-zero to print the root labeling message; also
 *                         forwarded to build_directory_structure()
 * fixed_time            - forwarded to build_directory_structure()
 * block_list_file       - when non-NULL, receives one entry per saved block
 *                         allocation: the file name (with the source
 *                         directory prefix replaced by the mount point when
 *                         it matches) followed by the output of
 *                         print_blocks()
 *
 * Returns 0 on success, or EXIT_FAILURE when an unsupported option is given
 * for a block device, the filesystem size cannot be determined, or control
 * comes back through longjmp(setjmp_env).
 */
int make_ext4fs_internal(int fd, const char *_directory, const char *_target_out_directory,
						 const char *_mountpoint, fs_config_func_t fs_config_func, int gzip,
						 int sparse, int crc, int wipe, int real_uuid,
						 struct selabel_handle *sehnd, int verbose, time_t fixed_time,
						 FILE* block_list_file)
{
	u32 root_inode_num;
	u16 root_mode;
	char *mountpoint;
	char *directory = NULL;
	char *target_out_directory = NULL;

	/* The longjmp() return path does not release the path copies below. */
	if (setjmp(setjmp_env))
		return EXIT_FAILURE; /* Handle a call to longjmp() */

	info.block_device = is_block_device_fd(fd);

	if (info.block_device && (sparse || gzip || crc)) {
		fprintf(stderr, "No sparse/gzip/crc allowed for block device\n");
		return EXIT_FAILURE;
	}

	/*
	 * Working copies of the caller's paths; each one is released with free()
	 * before this function returns normally.
	 */
	if (_mountpoint == NULL) {
		mountpoint = strdup("");
	} else {
		mountpoint = canonicalize_abs_slashes(_mountpoint);
	}

	if (_directory) {
		directory = canonicalize_rel_slashes(_directory);
	}

	if (_target_out_directory) {
		target_out_directory = canonicalize_rel_slashes(_target_out_directory);
	}

	if (info.len <= 0)
		info.len = get_file_size(fd);

	if (info.len <= 0) {
		fprintf(stderr, "Need size of filesystem\n");
		/* Do not leak the path copies on this early exit. */
		free(mountpoint);
		free(directory);
		free(target_out_directory);
		return EXIT_FAILURE;
	}

	if (info.block_size <= 0)
		info.block_size = compute_block_size();

	/* Round down the filesystem length to be a multiple of the block size */
	info.len &= ~((u64)info.block_size - 1);

	if (info.journal_blocks == 0)
		info.journal_blocks = compute_journal_blocks();

	/* Note: this assignment replaces any previously set compat flags. */
	if (info.no_journal == 0)
		info.feat_compat = EXT4_FEATURE_COMPAT_HAS_JOURNAL;
	else
		info.journal_blocks = 0;

	if (info.blocks_per_group <= 0)
		info.blocks_per_group = compute_blocks_per_group();

	if (info.inodes <= 0)
		info.inodes = compute_inodes();

	if (info.inode_size <= 0)
		info.inode_size = 256;

	if (info.label == NULL)
		info.label = "";

	info.inodes_per_group = compute_inodes_per_group();

	info.feat_compat |=
			EXT4_FEATURE_COMPAT_RESIZE_INODE |
			EXT4_FEATURE_COMPAT_EXT_ATTR;

	info.feat_ro_compat |=
			EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER |
			EXT4_FEATURE_RO_COMPAT_LARGE_FILE |
			EXT4_FEATURE_RO_COMPAT_GDT_CSUM;

	info.feat_incompat |=
			EXT4_FEATURE_INCOMPAT_EXTENTS |
			EXT4_FEATURE_INCOMPAT_FILETYPE;


	info.bg_desc_reserve_blocks = compute_bg_desc_reserve_blocks();

	printf("Creating filesystem with parameters:\n");
	printf("    Size: %"PRIu64"\n", info.len);
	printf("    Block size: %d\n", info.block_size);
	printf("    Blocks per group: %d\n", info.blocks_per_group);
	printf("    Inodes per group: %d\n", info.inodes_per_group);
	printf("    Inode size: %d\n", info.inode_size);
	printf("    Journal blocks: %d\n", info.journal_blocks);
	printf("    Label: %s\n", info.label);

	ext4_create_fs_aux_info();

	printf("    Blocks: %"PRIu64"\n", aux_info.len_blocks);
	printf("    Block groups: %d\n", aux_info.groups);
	printf("    Reserved block group size: %d\n", info.bg_desc_reserve_blocks);

	ext4_sparse_file = sparse_file_new(info.block_size, info.len);

	block_allocator_init();

	ext4_fill_in_sb(real_uuid);

	if (reserve_inodes(0, 10) == EXT4_ALLOCATE_FAILED)
		error("failed to reserve first 10 inodes");

	if (info.feat_compat & EXT4_FEATURE_COMPAT_HAS_JOURNAL)
		ext4_create_journal_inode();

	if (info.feat_compat & EXT4_FEATURE_COMPAT_RESIZE_INODE)
		ext4_create_resize_inode();

#ifdef USE_MINGW
	// Windows needs only 'create an empty fs image' functionality
	assert(!directory);
	root_inode_num = build_default_directory_structure(mountpoint, sehnd);
#else
	if (directory)
		root_inode_num = build_directory_structure(directory, mountpoint, target_out_directory, 0,
			fs_config_func, sehnd, verbose, fixed_time);
	else
		root_inode_num = build_default_directory_structure(mountpoint, sehnd);
#endif

	/* Root directory is rwxr-xr-x; the remaining arguments are passed as 0. */
	root_mode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
	inode_set_permissions(root_inode_num, root_mode, 0, 0, 0);

#ifndef USE_MINGW
	if (sehnd) {
		char *secontext = NULL;

		/* Label the root inode with the context registered for the mount point. */
		if (selabel_lookup(sehnd, &secontext, mountpoint, S_IFDIR) < 0) {
			error("cannot lookup security context for %s", mountpoint);
		}
		if (secontext) {
			if (verbose) {
				printf("Labeling %s as %s\n", mountpoint, secontext);
			}
			inode_set_selinux(root_inode_num, secontext);
		}
		freecon(secontext);
	}
#endif

	ext4_update_free();

	if (block_list_file) {
		size_t dirlen = directory ? strlen(directory) : 0;
		struct block_allocation* p = get_saved_allocation_chain();
		/* Walk the chain, emitting and then freeing each entry. */
		while (p) {
			if (directory && strncmp(p->filename, directory, dirlen) == 0) {
				// substitute mountpoint for the leading directory in the filename, in the output file
				fprintf(block_list_file, "%s%s", mountpoint, p->filename + dirlen);
			} else {
				fprintf(block_list_file, "%s", p->filename);
			}
			print_blocks(block_list_file, p);
			struct block_allocation* pn = p->next;
			free_alloc(p);
			p = pn;
		}
	}

	/* Used counts are derived as total minus free from the superblock. */
	printf("Created filesystem with %d/%d inodes and %d/%d blocks\n",
			aux_info.sb->s_inodes_count - aux_info.sb->s_free_inodes_count,
			aux_info.sb->s_inodes_count,
			aux_info.sb->s_blocks_count_lo - aux_info.sb->s_free_blocks_count_lo,
			aux_info.sb->s_blocks_count_lo);

	if (wipe && WIPE_IS_SUPPORTED) {
		wipe_block_device(fd, info.len);
	}

	write_ext4_image(fd, gzip, sparse, crc);

	sparse_file_destroy(ext4_sparse_file);
	ext4_sparse_file = NULL;

	free(mountpoint);
	free(directory);
	free(target_out_directory);

	return 0;
}