예제 #1
0
void SetPrint(int argcIn, const char* argvIn[]) {

	if( argcIn == 1 || (argcIn == 2 && argvIn[1][0] == 'm') )
	{
		cout << endl;
		PrintSteps(true);
		PrintSummary(true);
	}
	else if( argcIn > 1 && argcIn < 4 )
	{
		if( argvIn[1][0] == '-' && (argvIn[1][1] == 'n' || argvIn[1][9] == 'n') )
		{
			PrintSteps(false);
			PrintSummary(false);
		}
		else if( argvIn[1][0] == '-' && (argvIn[1][1] == 's' || argvIn[1][9] == 's') )
		{
			PrintSteps(true);
			PrintSummary(false);
		}
		else if( argvIn[1][0] == '-' && (argvIn[1][1] == 'a' || argvIn[1][9] == 'a') )
		{
			PrintSteps(false);
			PrintSummary(true);
		}
		cout << endl;
	}
	else
	{
		PrintSteps(false);
		PrintSummary(false);
	}
}
예제 #2
0
파일: pr_15_1.c 프로젝트: qnu/mdoch
void SingleEvent ()
{
  real vvSum;
  real sp;
  int n;

  NextEvent ();
  if (evIdB < MOL_LIMIT) {
    ProcessCollision ();
    ++ collCount;
  } else if (evIdB < MOL_LIMIT + NDIM * 2 || evIdB >= MOL_LIMIT + 100) {
    ProcessCellCrossing ();
    ++ crossCount;
  } else if (evIdB == MOL_LIMIT + 6) {
    UpdateSystem ();
    nextSumTime += intervalSum;
    ScheduleEvent (0, MOL_LIMIT + 6, nextSumTime);
    VZero (vSum);
    vvSum = 0.;
    sp = 0.;
    DO_MOL {
      VVAdd (vSum, mol[n].rv);
      vvSum += VLenSq (mol[n].rv);
      sp += VDot (mol[n].r, gravField);
    }
    kinEnVal = vvSum * 0.5 / nMol;
    totEnVal = kinEnVal - sp / nMol;
    PrintSummary (stdout);
  } else if (evIdB == MOL_LIMIT + 7) {
예제 #3
0
파일: pr_08_1.c 프로젝트: qnu/mdoch
void SingleStep ()
{
  ++ stepCount;
  timeNow = stepCount * deltaT;
  PredictorStep ();
  PredictorStepQ ();
  GenSiteCoords ();
  ComputeSiteForces ();
  ComputeTorqs ();
  ComputeAccelsQ ();
  ApplyThermostat ();
  CorrectorStep ();
  CorrectorStepQ ();
  AdjustQuat ();
  ApplyBoundaryCond ();
  EvalProps ();
  if (stepCount % stepAdjustTemp == 0) AdjustTemp ();
  AccumProps (1);
  if (stepCount % stepAvg == 0) {
    AccumProps (2);
    PrintSummary (stdout);
    AccumProps (0);
  }
  if (stepCount >= stepEquil && (stepCount - stepEquil) % stepRdf == 0)
     EvalRdf ();
}
예제 #4
0
int myGetState(void * hndl, UserState * ustate, unsigned int privdata)
{
    static int iter=0;

    log_verbose("Reached myGetState with privdata %x\n", privdata);

    /* Same state is being returned for both engines */

    ustate->LinkState = LINK_UP;
    ustate->Errors = 0;
    ustate->MinPktSize = RawMinPktSize;
    ustate->MaxPktSize = RawMaxPktSize;
    ustate->TestMode = RawTestMode;
    if(privdata == 0x54545454)
        ustate->Buffers = TxBufs.TotalNum;
    else
        ustate->Buffers = RxBufs.TotalNum;

    if(iter++ >= 4)
    {
        PrintSummary();

        iter = 0;
    }

    return 0;
}
예제 #5
0
파일: entry.c 프로젝트: mydecember/job
int main (int argc, char **argv) 
{
	int i, nocase = 0;
	FILE *fd;
	char filename[20];
	ACSM_STRUCT * acsm;

	if (argc < 3)
	{
		fprintf (stderr,"Usage: acsmx filename pattern1 pattern2 ...  -nocase\n");
		exit (0);
	}

	acsm = acsmNew ();

	strcpy (filename, argv[1]);
	fd = fopen(filename,"r");
	if(fd == NULL)
	{
		fprintf(stderr,"Open file error!\n");
		exit(1);
	}

	for (i = 1; i < argc; i++)
		if (strcmp (argv[i], "-nocase") == 0)
			nocase = 1;
	for (i = 2; i < argc; i++)
	{
		if (argv[i][0] == '-')
			continue;
		printf("%s,%d\n",argv[i],strlen (argv[i]));
		acsmAddPattern (acsm, argv[i], strlen (argv[i]), nocase,1);
	}
fgets(text,MAXLEN,fd);
	/* Generate GtoTo Table and Fail Table */
	acsmCompile (acsm);
printf("--------------------------------\n");
	NS_TIME(time);
NS_TIME_START(time);
	/*Search Pattern*/
	//while ( fgets(text,MAXLEN,fd) )
	//{
		acsmSearch (acsm, text, strlen (text), PrintMatch);
	//	nline++;
	//}
NS_TIME_END(time);

	PrintSummary(acsm->acsmPatterns);
int a[10]={45,45,45,4,1};
#ifdef __HAVE__LOAD__
printf("-------%d\n", getSummary (acsm->acsmPatterns,a));
#endif

	acsmFree (acsm);

	printf ("\n### AC Match Finished ###\n");
//	system("pause");

	return (0);
}
예제 #6
0
파일: pr_14_2.c 프로젝트: mfer/simula
void SingleEvent ()
{
  real vvSum;
  int n;

  NextEvent ();
  if (evIdB < MOL_LIMIT) {
    ProcessCollision ();
    EvalFreePath ();
    ++ collCount;
  } else if (evIdB >= MOL_LIMIT + 100) {
    ProcessCellCrossing ();
    ++ crossCount;
  } else if (evIdB == MOL_LIMIT + 6) {
    UpdateSystem ();
    nextSumTime += intervalSum;
    ScheduleEvent (0, MOL_LIMIT + 6, nextSumTime);
    VZero (vSum);
    vvSum = 0.;
    DO_MOL {
      VVAdd (vSum, mol[n].rv);
      vvSum += VLenSq (mol[n].rv);
    }
    kinEnVal = vvSum * 0.5 / nMol;
    PrintSummary (stdout);
  }
예제 #7
0
void PrintQueryResult(int ego, char **caption, char **result, int row, int col) {
	if (ego)
		VPrint(caption, result, row, col);
	else
		HPrint(caption, result, row, col);
	PrintSummary(row);
}
예제 #8
0
파일: pr_13_1.c 프로젝트: qnu/mdoch
void SingleStep ()
{
  ++ stepCount;
  timeNow = stepCount * deltaT;
  PredictorStep ();
  PredictorStepS ();
  ComputeForces ();
  ComputeForcesDipoleR ();
  ComputeForcesDipoleF ();
  ComputeDipoleAccel ();
  ApplyThermostat ();
  CorrectorStep ();
  CorrectorStepS ();
  AdjustDipole ();
  ApplyBoundaryCond ();
  EvalProps ();
  if (stepCount % stepAdjustTemp == 0) AdjustTemp ();
  AccumProps (1);
  if (stepCount % stepAvg == 0) {
    AccumProps (2);
    PrintSummary (stdout);
    AccumProps (0);
  }
  if (stepCount >= stepEquil && (stepCount - stepEquil) % stepRdf == 0)
     EvalRdf ();
}
예제 #9
0
static void __exit rawdata_cleanup(void)
{
    int i;

    /* Stop the polling routine */
    del_timer_sync(&poll_timer);
    //DriverState = CLOSED;

    /* Stop any running tests, else the hardware's packet checker &
     * generator will continue to run.
     */
    XIo_Out32(TXbarbase+TX_CONFIG_ADDRESS, 0);

    XIo_Out32(TXbarbase+RX_CONFIG_ADDRESS, 0);

    printk(KERN_INFO "%s: Unregistering Xilinx driver from kernel.\n", MYNAME);
    if (TxBufCnt != RxBufCnt)
    {
        printk("%s: Buffers Transmitted %u Received %u\n", MYNAME, TxBufCnt, RxBufCnt);
        printk("TxSeqNo = %u, RxSeqNo = %u\n", TxSeqNo, RxSeqNo);
        mdelay(1);
    }
#ifdef FIFO_EMPTY_CHECK
    DmaFifoEmptyWait(MYHANDLE,DIR_TYPE_S2C);
    // wait for appropriate time to stabalize
    mdelay(STABILITY_WAIT_TIME);
#endif

    DmaUnregister(handle[0]);

#ifdef FIFO_EMPTY_CHECK
    DmaFifoEmptyWait(MYHANDLE,DIR_TYPE_C2S);
    // wait for appropriate time to stabalize
    mdelay(STABILITY_WAIT_TIME);
#endif
    DmaUnregister(handle[2]);

    PrintSummary();

    mdelay(1000);

    /* Not sure if free_page() sleeps or not. */
    spin_lock_bh(&RawLock);
    printk("Freeing user buffers\n");
    for(i=0; i<TxBufs.TotalNum; i++)
        //kfree(TxBufs.origVA[i]);
        free_page((unsigned long)(TxBufs.origVA[i]));
    for(i=0; i<RxBufs.TotalNum; i++)
        //kfree(RxBufs.origVA[i]);
        free_page((unsigned long)(RxBufs.origVA[i]));
    spin_unlock_bh(&RawLock);
}
예제 #10
0
void MCommandProfiler::Analysis()
{
	FILE* fp = fopen("CmdProfile.txt", "wt");
	if (fp == 0) return;

	PrintTitle(fp);
	PrintSummary(fp);
	PrintCmdItems(fp);

	fclose(fp);

	Reset();
}
예제 #11
0
파일: pr_02_1.c 프로젝트: juannnesss/MC
int main (int argc, char **argv)
{
    GetNameList (argc, argv);
    //PrintNameList (stdout);
    SetParams ();
    SetupJob ();
    moreCycles = 1;
    PrintSummary (stdout);
    while (moreCycles) {
        SingleStep ();
        if (stepCount >= stepLimit) moreCycles = 0;
    }
}
예제 #12
0
파일: entry.cpp 프로젝트: tuyu/alo_ac
int main(int argc, char **argv)
{
	int i, nocase = 0;
	FILE *fd;
	char filename[20];
	ACSM_STRUCT * acsm;

//	if (argc < 3)
//	{
//		fprintf(stderr, "Usage: acsmx filename pattern1 pattern2 ...  -nocase\n");
//		exit(0);
//	}

	acsm = acsmNew();

	strcpy_s(filename, "test.txt");
	fd = fopen(filename, "r");
	if (fd == NULL)
	{
		fprintf(stderr, "Open file error!\n");
		exit(1);
	}

	for (i = 1; i < argc; i++)
	if (strcmp(argv[i], "-nocase") == 0)
		nocase = 1;
	
	char a[] = { "test" };
	acsmAddPattern(acsm, (unsigned char *)a, strlen(a), nocase);

	/* Generate GtoTo Table and Fail Table */
	acsmCompile(acsm);

	/*Search Pattern*/
	while (fgets(( char*)text, MAXLEN, fd))
	{
		acsmSearch(acsm, text, strlen((char*)text), PrintMatch);
		nline++;
	}

	PrintSummary(acsm->acsmPatterns);

	acsmFree(acsm);

	printf("\n### AC Match Finished ###\n");
	system("pause");

	return (0);
}
예제 #13
0
파일: pr_02_1.c 프로젝트: juannnesss/MC
void SingleStep ()
{
    ++ stepCount;
    timeNow = stepCount * deltaT;
    LeapfrogStep (1);
    ApplyBoundaryCond ();
    ComputeForces ();
    LeapfrogStep (2);
    //EvalProps ();
    //AccumProps (1);
    if (stepCount % stepAvg == 0) {
        //AccumProps (2);
        PrintSummary (stdout);
        //PrintVels (stdout);
        //AccumProps (0);
    }
}
예제 #14
0
static void __exit rawdata_cleanup(void)
{
    int i;

    /* Stop the polling routine */
    del_timer_sync(&poll_timer);
    //DriverState = CLOSED;

    /* Stop any running tests, else the hardware's packet checker &
     * generator will continue to run.
     */

    log_verbose("TXbarbase = %p\n", TXbarbase);

    XIo_Out32(TXbarbase+TX_CONFIG_ADDRESS, 0);

#ifndef XAUI
    XIo_Out32(TXbarbase+RX_CONFIG_ADDRESS, 0);
#endif

    printk(KERN_INFO "%s: Unregistering Xilinx driver from kernel.\n", MYNAME);
    if (TxBufCnt != RxBufCnt)
    {
        printk("%s: Buffers Transmitted %u Received %u\n", MYNAME, TxBufCnt, RxBufCnt);
        printk("TxSeqNo = %u, RxSeqNo = %u\n", TxSeqNo, RxSeqNo);
        mdelay(1);
    }
    DmaUnregister(handle[0]);
    DmaUnregister(handle[2]);

    PrintSummary();
    
    mdelay(1000);

    /* Not sure if free_page() sleeps or not. */
    spin_lock_bh(&RawLock);
    printk("Freeing user buffers\n");
    for(i=0; i<TxBufs.TotalNum; i++)
        //kfree(TxBufs.origVA[i]);
        free_page((unsigned long)(TxBufs.origVA[i]));
    for(i=0; i<RxBufs.TotalNum; i++)
        //kfree(RxBufs.origVA[i]);
        free_page((unsigned long)(RxBufs.origVA[i]));
    spin_unlock_bh(&RawLock);
}
예제 #15
0
int main(int argc, char* argv[])
{
	int attrib_ct, line_ct;
	int k; //loop counters
	ListP *Attributes; 

	//open files
	getFileHandles(argc, argv); 
	//get attributes and line counters
	attrib_ct = readValue();
	line_ct = readValue();
	getc(inF); //eat newline
	//Make array of attributes
	Attributes = makeArrayofList(attrib_ct);

	//read file
	for (k = 0; k < line_ct; k++)
		readAttributes(Attributes, attrib_ct);
	PrintSummary(Attributes, attrib_ct);
}
예제 #16
0
void SingleStep ()
{
  ++ stepCount;
  timeNow = stepCount * deltaT;
  LeapfrogStep (1);
  GenSiteCoords ();
  ComputeSiteForces ();
  ComputeTorqs ();
  ApplyThermostat ();
  LeapfrogStep (2);
  ApplyBoundaryCond ();
  EvalProps ();
  if (stepCount % stepAdjustTemp == 0 || stepCount < stepEquil &&
     stepCount % 100 == 0) AdjustTemp ();
  AccumProps (1);
  if (stepCount % stepAvg == 0) {
    AccumProps (2);
    PrintSummary (stdout);
    AccumProps (0);
  }
}
예제 #17
0
STATUS LNPUBLIC ReadSummaryData
            (VOID far *optional_param,
            SEARCH_MATCH far *search_info,
            ITEM_TABLE far *summary_info)
{
    SEARCH_MATCH  SearchMatch;
    STATUS        error;

    memcpy ((char*)(&SearchMatch), (char *)search_info, sizeof(SEARCH_MATCH));

    if (!(SearchMatch.SERetFlags & SE_FMATCH))
        return (NOERROR);

    /* Print the note ID. */

    printf ("\nNote ID is: %#010lx.\n", SearchMatch.ID.NoteID);

    /* Print the summary data. */

    if (error = PrintSummary( (char*)summary_info ))
        return (error);

    return (NOERROR);
}
예제 #18
0
파일: Sync.c 프로젝트: mohamso/icpads14
////////////////////////////////////////////////////////////////////////////////
// Program Main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
	int Nx, Ny, Nz, max_iters;
	int blockX, blockY, blockZ;

	if (argc == 8) {
		Nx = atoi(argv[1]);
		Ny = atoi(argv[2]);
		Nz = atoi(argv[3]);
		max_iters = atoi(argv[4]);
		blockX = atoi(argv[5]);
		blockY = atoi(argv[6]);
		blockZ = atoi(argv[7]);
	}
	else
	{
		printf("Usage: %s nx ny nz i block_x block_y block_z number_of_threads\n", 
			argv[0]);
		exit(1);
	}

	// Get the number of GPUS
	int number_of_devices;
	checkCuda(cudaGetDeviceCount(&number_of_devices));
  
  if (number_of_devices < 2) {
  	printf("Less than two devices were found.\n");
  	printf("Exiting...\n");

  	return -1;
  }

	// Decompose along the Z-axis
	int _Nz = Nz/number_of_devices;

	// Define constants
	const _DOUBLE_ L = 1.0;
	const _DOUBLE_ h = L/(Nx+1);
	const _DOUBLE_ dt = h*h/6.0;
	const _DOUBLE_ beta = dt/(h*h);
	const _DOUBLE_ c0 = beta;
	const _DOUBLE_ c1 = (1-6*beta);

	// Check if ECC is turned on
	ECCCheck(number_of_devices);

	// Set the number of OpenMP threads
	omp_set_num_threads(number_of_devices);

	#pragma omp parallel
	{
		unsigned int tid = omp_get_num_threads();

		#pragma omp single
		{
			printf("Number of OpenMP threads: %d\n", tid);
		}
	}

  // CPU memory operations
  int dt_size = sizeof(_DOUBLE_);

	_DOUBLE_ *u_new, *u_old;

	u_new = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
	u_old = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

	init(u_old, u_new, h, Nx, Ny, Nz);

	// Allocate and generate arrays on the host
	size_t pitch_bytes;
	size_t pitch_gc_bytes;

	_DOUBLE_ *h_Unew, *h_Uold;
	_DOUBLE_ *h_s_Uolds[number_of_devices], *h_s_Unews[number_of_devices];
	_DOUBLE_ *left_send_buffer[number_of_devices], *left_receive_buffer[number_of_devices];
	_DOUBLE_ *right_send_buffer[number_of_devices], *right_receive_buffer[number_of_devices];

	h_Unew = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
	h_Uold = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

	init(h_Uold, h_Unew, h, Nx, Ny, Nz);

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		h_s_Unews[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2));
		h_s_Uolds[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2));

		right_send_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));
		left_send_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));
		right_receive_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));
		left_receive_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));

		checkCuda(cudaHostAlloc((void**)&h_s_Unews[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&h_s_Uolds[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&right_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&left_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&right_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&left_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

		init_subdomain(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
	}

	// GPU memory operations
	_DOUBLE_ *d_s_Unews[number_of_devices], *d_s_Uolds[number_of_devices];
	_DOUBLE_ *d_right_send_buffer[number_of_devices], *d_left_send_buffer[number_of_devices];
	_DOUBLE_ *d_right_receive_buffer[number_of_devices], *d_left_receive_buffer[number_of_devices];

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		checkCuda(cudaSetDevice(tid));

		CopyToConstantMemory(c0, c1);

		checkCuda(cudaMallocPitch((void**)&d_s_Uolds[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
		checkCuda(cudaMallocPitch((void**)&d_s_Unews[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
		checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_left_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_right_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
	}

	// Copy data from host to the device
	double HtD_timer = 0.;
	HtD_timer -= omp_get_wtime();
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();
		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaMemcpy2D(d_s_Uolds[tid], pitch_bytes, h_s_Uolds[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
		checkCuda(cudaMemcpy2D(d_s_Unews[tid], pitch_bytes, h_s_Unews[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
	}
	HtD_timer += omp_get_wtime();

	int pitch = pitch_bytes/dt_size;
	int gc_pitch = pitch_gc_bytes/dt_size;

    // GPU kernel launch parameters
	dim3 threads_per_block(blockX, blockY, blockZ);
	unsigned int blocksInX = getBlock(Nx, blockX);
	unsigned int blocksInY = getBlock(Ny, blockY);
	unsigned int blocksInZ = getBlock(_Nz-2, k_loop);
	dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
	dim3 thread_blocks_halo(blocksInX, blocksInY);

	double compute_timer = 0.;
  compute_timer -= omp_get_wtime();

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		for(int iterations = 0; iterations < max_iters; iterations++)
		{
			// Compute inner nodes
			checkCuda(cudaSetDevice(tid));
			ComputeInnerPoints(thread_blocks, threads_per_block, d_s_Unews[tid], d_s_Uolds[tid], pitch, Nx, Ny, _Nz);

			// Copy right boundary data to host
			if (tid == 0)
			{
				checkCuda(cudaSetDevice(tid));
				CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
				checkCuda(cudaMemcpy2D(right_send_buffer[tid], dt_size*(Nx+2), d_right_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
			}
			// Copy left boundary data to host
			if (tid == 1)
			{
				checkCuda(cudaSetDevice(tid));
				CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
				checkCuda(cudaMemcpy2D(left_send_buffer[tid], dt_size*(Nx+2), d_left_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
			}

			#pragma omp barrier

			// Copy right boundary data to device 1
			if (tid == 1)
			{
				checkCuda(cudaSetDevice(tid));
				
				checkCuda(cudaMemcpy2D(d_left_receive_buffer[tid], pitch_gc_bytes, right_send_buffer[tid-1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
				CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
			}

			// Copy left boundary data to device 0
			if (tid == 0)
			{
				checkCuda(cudaSetDevice(tid));

				checkCuda(cudaMemcpy2D(d_right_receive_buffer[tid], pitch_gc_bytes, left_send_buffer[tid+1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
				CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
			}

			// Swap pointers on the host
			#pragma omp barrier
			checkCuda(cudaSetDevice(tid));
			checkCuda(cudaDeviceSynchronize());
			swap(_DOUBLE_*, d_s_Unews[tid], d_s_Uolds[tid]);
		}
	}

	compute_timer += omp_get_wtime();

  // Copy data from device to host
	double DtH_timer = 0;
  DtH_timer -= omp_get_wtime();
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();
		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaMemcpy2D(h_s_Uolds[tid], dt_size*(Nx+2), d_s_Uolds[tid], pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDeviceToHost));
	}
	DtH_timer += omp_get_wtime();

	// Merge sub-domains into a one big domain
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		merge_domains(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
	}

   	// Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
	cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
#endif

    float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
    PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

    _DOUBLE_ t = max_iters * dt;
    CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);

#if defined(DEBUG) || defined(_DEBUG)
    //exportToVTK(h_Uold, h, "heat3D.vtk", Nx, Ny, Nz);
#endif

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();
		
		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaFree(d_s_Unews[tid]));
    checkCuda(cudaFree(d_s_Uolds[tid]));
    checkCuda(cudaFree(d_right_send_buffer[tid]));
    checkCuda(cudaFree(d_left_send_buffer[tid]));
    checkCuda(cudaFree(d_right_receive_buffer[tid]));
    checkCuda(cudaFree(d_left_receive_buffer[tid]));
    checkCuda(cudaFreeHost(h_s_Unews[tid]));
    checkCuda(cudaFreeHost(h_s_Uolds[tid]));
    checkCuda(cudaFreeHost(left_send_buffer[tid]));
    checkCuda(cudaFreeHost(right_send_buffer[tid]));
    checkCuda(cudaFreeHost(left_receive_buffer[tid]));
    checkCuda(cudaFreeHost(right_receive_buffer[tid]));
    checkCuda(cudaDeviceReset());
  }

  free(u_old);
  free(u_new);

	return 0;
}
예제 #19
0
파일: Async.c 프로젝트: mohamso/icpads14
///////////////////////
// Main program entry
///////////////////////
int main(int argc, char** argv)
{
	unsigned int max_iters, Nx, Ny, Nz, blockX, blockY, blockZ;
	int rank, numberOfProcesses;

	if (argc == 8)
	{
		Nx = atoi(argv[1]);
		Ny = atoi(argv[2]);
		Nz = atoi(argv[3]);
		max_iters = atoi(argv[4]);
		blockX = atoi(argv[5]);
		blockY = atoi(argv[6]);
		blockZ = atoi(argv[7]);
	}
	else
	{
		printf("Usage: %s nx ny nz i block_x block_y block_z\n", argv[0]);
		exit(1);
	}

  InitializeMPI(&argc, &argv, &rank, &numberOfProcesses);
  AssignDevices(rank);
  ECCCheck(rank);

	// Define constants
	const _DOUBLE_ L = 1.0;
	const _DOUBLE_ h = L/(Nx+1);
	const _DOUBLE_ dt = h*h/6.0;
	const _DOUBLE_ beta = dt/(h*h);
	const _DOUBLE_ c0 = beta;
	const _DOUBLE_ c1 = (1-6*beta);

	// Copy constants to Constant Memory on the GPUs
	CopyToConstantMemory(c0, c1);

	// Decompose along the z-axis
	const int _Nz = Nz/numberOfProcesses;
  const int dt_size = sizeof(_DOUBLE_);

    // Host memory allocations
    _DOUBLE_ *u_new, *u_old;
    _DOUBLE_ *h_Uold;

    u_new = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    u_old = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

    if (rank == 0)
    {
    	h_Uold = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); 
    }

    init(u_old, u_new, h, Nx, Ny, Nz);

    // Allocate and generate host subdomains
    _DOUBLE_ *h_s_Uolds, *h_s_Unews, *h_s_rbuf[numberOfProcesses];
    _DOUBLE_ *left_send_buffer, *left_receive_buffer;
    _DOUBLE_ *right_send_buffer, *right_receive_buffer;

    h_s_Unews = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2));
    h_s_Uolds = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2));

#if defined(DEBUG) || defined(_DEBUG)
  if (rank == 0)
  {
    for (int i = 0; i < numberOfProcesses; i++)
    {
        h_s_rbuf[i] = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2));
        checkCuda(cudaHostAlloc((void**)&h_s_rbuf[i], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
    }
  }
#endif

    right_send_buffer = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));
    left_send_buffer = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));
    right_receive_buffer = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));
    left_receive_buffer = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));

    checkCuda(cudaHostAlloc((void**)&h_s_Unews, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&h_s_Uolds, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));

    checkCuda(cudaHostAlloc((void**)&right_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&left_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&right_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&left_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

    init_subdomain(h_s_Uolds, u_old, Nx, Ny, _Nz, rank);

	// GPU stream operations
	cudaStream_t compute_stream;
	cudaStream_t data_stream;

	checkCuda(cudaStreamCreate(&compute_stream));
	checkCuda(cudaStreamCreate(&data_stream));

	// GPU Memory Operations
	size_t pitch_bytes, pitch_gc_bytes;

  _DOUBLE_ *d_s_Unews, *d_s_Uolds;
  _DOUBLE_ *d_right_send_buffer, *d_left_send_buffer;
  _DOUBLE_ *d_right_receive_buffer, *d_left_receive_buffer;

  checkCuda(cudaMallocPitch((void**)&d_s_Uolds, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
  checkCuda(cudaMallocPitch((void**)&d_s_Unews, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));

  checkCuda(cudaMallocPitch((void**)&d_left_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
  checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
  checkCuda(cudaMallocPitch((void**)&d_right_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
  checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));

	// Copy subdomains from host to device and get walltime
	double HtD_timer = 0.;

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	HtD_timer -= MPI_Wtime();
  MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

  checkCuda(cudaMemcpy2D(d_s_Uolds, pitch_bytes, h_s_Uolds, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
  checkCuda(cudaMemcpy2D(d_s_Unews, pitch_bytes, h_s_Unews, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));

  MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	HtD_timer += MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	unsigned int ghost_width = 1;

	int pitch = pitch_bytes/dt_size;
	int gc_pitch = pitch_gc_bytes/dt_size;

  // GPU kernel launch parameters
	dim3 threads_per_block(blockX, blockY, blockZ);
	unsigned int blocksInX = getBlock(Nx, blockX);
	unsigned int blocksInY = getBlock(Ny, blockY);
	unsigned int blocksInZ = getBlock(_Nz-2, k_loop);

	dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
	dim3 thread_blocks_halo(blocksInX, blocksInY);

	//MPI_Status status;
	MPI_Status status[numberOfProcesses];
	MPI_Request gather_send_request[numberOfProcesses];
	MPI_Request right_send_request[numberOfProcesses], left_send_request[numberOfProcesses];
	MPI_Request right_receive_request[numberOfProcesses], left_receive_request[numberOfProcesses];

	double compute_timer = 0.;

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
  compute_timer -= MPI_Wtime();
  MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	for(unsigned int iterations = 0; iterations < max_iters; iterations++)
	{
		// Compute right boundary data on device 0
		if (rank == 0) {
      int kstart = (_Nz+1)-ghost_width;
	    int kstop = _Nz+1;

			ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
			CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);
			
			checkCuda(cudaMemcpy2DAsync(right_send_buffer, dt_size*(Nx+2), d_right_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
			checkCuda(cudaStreamSynchronize(data_stream));

			MPI_CHECK(MPI_Isend(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &right_send_request[rank]));
		}
		else
		{
			int kstart = 1;
			int kstop = 1+ghost_width;

			ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
			CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);
			
			checkCuda(cudaMemcpy2DAsync(left_send_buffer, dt_size*(Nx+2), d_left_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
			checkCuda(cudaStreamSynchronize(data_stream));

			MPI_CHECK(MPI_Isend(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &left_send_request[rank]));
		}

		// Compute inner nodes for device 0
		if (rank == 0) {
			int kstart = 1;
			int kstop = (_Nz+1)-ghost_width;

			ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
		}
		// Compute inner nodes for device 1
		else
		{
			int kstart = 1+ghost_width;
			int kstop = _Nz+1;

			ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
		}

		// Receive data from device 1
		if (rank == 0) {
			MPI_CHECK(MPI_Irecv(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 1, MPI_COMM_WORLD, &right_receive_request[rank]));
		}
		else
		{
			MPI_CHECK(MPI_Irecv(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &left_receive_request[rank]));
		}

		if (rank == 0) {
      MPI_CHECK(MPI_Wait(&right_receive_request[rank], &status[rank]));

			checkCuda(cudaMemcpy2DAsync(d_right_receive_buffer, pitch_gc_bytes, left_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
			CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);
		}
		else
		{
			MPI_CHECK(MPI_Wait(&left_receive_request[rank], &status[rank]));

			checkCuda(cudaMemcpy2DAsync(d_left_receive_buffer, pitch_gc_bytes, right_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
			CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);
		}

		if (rank == 0)
		{
			MPI_CHECK(MPI_Wait(&right_send_request[rank], MPI_STATUS_IGNORE));
		}
		else
		{
			MPI_CHECK(MPI_Wait(&left_send_request[rank], MPI_STATUS_IGNORE));
		}

		// Swap pointers on the host
		checkCuda(cudaDeviceSynchronize());
		swap(_DOUBLE_*, d_s_Unews, d_s_Uolds);
	}

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	compute_timer += MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	// Copy data from device to host
	double DtH_timer = 0;

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
  DtH_timer -= MPI_Wtime();
  MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	checkCuda(cudaMemcpy2D(h_s_Uolds, dt_size*(Nx+2), d_s_Uolds, pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDefault));

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	DtH_timer += MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

  // Gather results from subdomains
  MPI_CHECK(MPI_Isend(h_s_Uolds, (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &gather_send_request[rank]));

	if (rank == 0)
	{
		for (int i = 0; i < numberOfProcesses; i++)
		{
			MPI_CHECK(MPI_Recv(h_s_rbuf[i], (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &status[rank]));
			merge_domains(h_s_rbuf[i], h_Uold, Nx, Ny, _Nz, i);
		}
	}

	// Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
	if (rank == 0)
	{
		cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
	}
#endif

	if (rank == 0)
	{
		float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
		PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

		_DOUBLE_ t = max_iters * dt;
		CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);
	}

	Finalize();

  // Free device memory
  checkCuda(cudaFree(d_s_Unews));
  checkCuda(cudaFree(d_s_Uolds));
  checkCuda(cudaFree(d_right_send_buffer));
  checkCuda(cudaFree(d_left_send_buffer));
  checkCuda(cudaFree(d_right_receive_buffer));
  checkCuda(cudaFree(d_left_receive_buffer));

  // Free host memory
  checkCuda(cudaFreeHost(h_s_Unews));
  checkCuda(cudaFreeHost(h_s_Uolds));

#if defined(DEBUG) || defined(_DEBUG)
  if (rank == 0)
  {
  	for (int i = 0; i < numberOfProcesses; i++)
  	{
  		checkCuda(cudaFreeHost(h_s_rbuf[i]));
  	}

    free(h_Uold);
  }
#endif

  checkCuda(cudaFreeHost(left_send_buffer));
  checkCuda(cudaFreeHost(left_receive_buffer));
  checkCuda(cudaFreeHost(right_send_buffer));
  checkCuda(cudaFreeHost(right_receive_buffer));

  checkCuda(cudaDeviceReset());

  free(u_old);
  free(u_new);

  return 0;
}
예제 #20
0
  void CG3::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) 
  {

    // Check to see that we're not trying to invert on a zero-field source    
    const double b2 = norm2(b);
    if(b2 == 0){
      profile.TPSTOP(QUDA_PROFILE_INIT);
      printfQuda("Warning: inverting on zero-field source\n");
      x=b;
      param.true_res = 0.0;
      param.true_res_hq = 0.0;
      return;
    }

    ColorSpinorParam csParam(x);
    csParam.create = QUDA_ZERO_FIELD_CREATE;
  
    
    cudaColorSpinorField x_prev(b, csParam);  
    cudaColorSpinorField r_prev(b, csParam);
    cudaColorSpinorField temp(b, csParam);

    cudaColorSpinorField r(b);
    cudaColorSpinorField w(b);


    mat(r, x, temp);  // r = Mx
    double r2 = xmyNormCuda(b,r); // r = b - Mx
    PrintStats("CG3", 0, r2, b2, 0.0);


    double stop = stopping(param.tol, b2, param.residual_type);
    if(convergence(r2, 0.0, stop, 0.0)) return;
    // First iteration 
    mat(w, r, temp);
    double rAr = reDotProductCuda(r,w);
    double rho = 1.0;
    double gamma_prev = 0.0;
    double gamma = r2/rAr;


    cudaColorSpinorField x_new(x);
    cudaColorSpinorField r_new(r);
    axpyCuda(gamma, r, x_new);  // x_new += gamma*r
    axpyCuda(-gamma, w, r_new); // r_new -= gamma*w
    // end of first iteration  

    // axpbyCuda(a,b,x,y) => y = a*x + b*y

    int k = 1; // First iteration performed above

    double r2_prev;
    while(!convergence(r2, 0.0, stop, 0.0) && k<param.maxiter){
      x_prev = x; x = x_new;
      r_prev = r; r = r_new;
      mat(w, r, temp);
      rAr = reDotProductCuda(r,w);
      r2_prev = r2;
      r2 = norm2(r);

      // Need to rearrange this!
      PrintStats("CG3", k, r2, b2, 0.0);

      gamma_prev = gamma;
      gamma = r2/rAr;
      rho = 1.0/(1. - (gamma/gamma_prev)*(r2/r2_prev)*(1.0/rho));
      
      x_new = x;
      axCuda(rho,x_new); 
      axpyCuda(rho*gamma,r,x_new);
      axpyCuda((1. - rho),x_prev,x_new);

      r_new = r;
      axCuda(rho,r_new);
      axpyCuda(-rho*gamma,w,r_new);
      axpyCuda((1.-rho),r_prev,r_new);


       double rr_old = reDotProductCuda(r_new,r);
      printfQuda("rr_old = %1.14lf\n", rr_old);


 
      k++;
    }


    if(k == param.maxiter)
      warningQuda("Exceeded maximum iterations %d", param.maxiter);

    // compute the true residual
    mat(r, x, temp);
    param.true_res = sqrt(xmyNormCuda(b, r)/b2);

    PrintSummary("CG3", k, r2, b2);

    return;
  }
예제 #21
0
  void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) 
  {
    profile.Start(QUDA_PROFILE_INIT);

    // Check to see that we're not trying to invert on a zero-field source    
    const double b2 = norm2(b);
    if(b2 == 0){
      profile.Stop(QUDA_PROFILE_INIT);
      printfQuda("Warning: inverting on zero-field source\n");
      x=b;
      param.true_res = 0.0;
      param.true_res_hq = 0.0;
      return;
    }


    cudaColorSpinorField r(b);

    ColorSpinorParam csParam(x);
    csParam.create = QUDA_ZERO_FIELD_CREATE;
    cudaColorSpinorField y(b, csParam); 
  
    mat(r, x, y);
//    zeroCuda(y);

    double r2 = xmyNormCuda(b, r);
  
    csParam.setPrecision(param.precision_sloppy);
    cudaColorSpinorField Ap(x, csParam);
    cudaColorSpinorField tmp(x, csParam);

    cudaColorSpinorField *tmp2_p = &tmp;
    // tmp only needed for multi-gpu Wilson-like kernels
    if (mat.Type() != typeid(DiracStaggeredPC).name() && 
	mat.Type() != typeid(DiracStaggered).name()) {
      tmp2_p = new cudaColorSpinorField(x, csParam);
    }
    cudaColorSpinorField &tmp2 = *tmp2_p;

    cudaColorSpinorField *x_sloppy, *r_sloppy;
    if (param.precision_sloppy == x.Precision()) {
      csParam.create = QUDA_REFERENCE_FIELD_CREATE;
      x_sloppy = &x;
      r_sloppy = &r;
    } else {
      csParam.create = QUDA_COPY_FIELD_CREATE;
      x_sloppy = new cudaColorSpinorField(x, csParam);
      r_sloppy = new cudaColorSpinorField(r, csParam);
    }

    cudaColorSpinorField &xSloppy = *x_sloppy;
    cudaColorSpinorField &rSloppy = *r_sloppy;
    cudaColorSpinorField p(rSloppy);

    if(&x != &xSloppy){
      copyCuda(y,x);
      zeroCuda(xSloppy);
    }else{
      zeroCuda(y);
    }
    
    const bool use_heavy_quark_res = 
      (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
    
    profile.Stop(QUDA_PROFILE_INIT);
    profile.Start(QUDA_PROFILE_PREAMBLE);

    double r2_old;
    double stop = b2*param.tol*param.tol; // stopping condition of solver

    double heavy_quark_res = 0.0; // heavy quark residual
    if(use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
    int heavy_quark_check = 10; // how often to check the heavy quark residual

    double alpha=0.0, beta=0.0;
    double pAp;
    int rUpdate = 0;

    double rNorm = sqrt(r2);
    double r0Norm = rNorm;
    double maxrx = rNorm;
    double maxrr = rNorm;
    double delta = param.delta;

    // this parameter determines how many consective reliable update
    // reisudal increases we tolerate before terminating the solver,
    // i.e., how long do we want to keep trying to converge
    int maxResIncrease = 0; // 0 means we have no tolerance 

    profile.Stop(QUDA_PROFILE_PREAMBLE);
    profile.Start(QUDA_PROFILE_COMPUTE);
    blas_flops = 0;

    int k=0;
    
    PrintStats("CG", k, r2, b2, heavy_quark_res);

    int steps_since_reliable = 1;

    while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && 
	    k < param.maxiter) {
      matSloppy(Ap, p, tmp, tmp2); // tmp as tmp
    
      double sigma;

      bool breakdown = false;

      if (param.pipeline) {
	double3 triplet = tripleCGReductionCuda(rSloppy, Ap, p);
	r2 = triplet.x; double Ap2 = triplet.y; pAp = triplet.z;
	r2_old = r2;

	alpha = r2 / pAp;        
	sigma = alpha*(alpha * Ap2 - pAp);
	if (sigma < 0.0 || steps_since_reliable==0) { // sigma condition has broken down
	  r2 = axpyNormCuda(-alpha, Ap, rSloppy);
	  sigma = r2;
	  breakdown = true;
	}

	r2 = sigma;
      } else {
	r2_old = r2;
	pAp = reDotProductCuda(p, Ap);
	alpha = r2 / pAp;        

	// here we are deploying the alternative beta computation 
	Complex cg_norm = axpyCGNormCuda(-alpha, Ap, rSloppy);
	r2 = real(cg_norm); // (r_new, r_new)
	sigma = imag(cg_norm) >= 0.0 ? imag(cg_norm) : r2; // use r2 if (r_k+1, r_k+1-r_k) breaks
      }

      // reliable update conditions
      rNorm = sqrt(r2);
      if (rNorm > maxrx) maxrx = rNorm;
      if (rNorm > maxrr) maxrr = rNorm;
      int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
      int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
    
      // force a reliable update if we are within target tolerance (only if doing reliable updates)
      if ( convergence(r2, heavy_quark_res, stop, param.tol_hq) && delta >= param.tol) updateX = 1;

      if ( !(updateR || updateX)) {
	//beta = r2 / r2_old;
	beta = sigma / r2_old; // use the alternative beta computation

	if (param.pipeline && !breakdown) tripleCGUpdateCuda(alpha, beta, Ap, rSloppy, xSloppy, p);
	else axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);

	if (use_heavy_quark_res && k%heavy_quark_check==0) { 
	  copyCuda(tmp,y);
	  heavy_quark_res = sqrt(xpyHeavyQuarkResidualNormCuda(xSloppy, tmp, rSloppy).z);
	}

	steps_since_reliable++;
      } else {
	axpyCuda(alpha, p, xSloppy);
	if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
      
	xpyCuda(x, y); // swap these around?
	mat(r, y, x); // here we can use x as tmp
	r2 = xmyNormCuda(b, r);

	if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r);            
	zeroCuda(xSloppy);

	// break-out check if we have reached the limit of the precision
	static int resIncrease = 0;
	if (sqrt(r2) > r0Norm && updateX) { // reuse r0Norm for this
	  warningQuda("CG: new reliable residual norm %e is greater than previous reliable residual norm %e", sqrt(r2), r0Norm);
	  k++;
	  rUpdate++;
	  if (++resIncrease > maxResIncrease) break; 
	} else {
	  resIncrease = 0;
	}

	rNorm = sqrt(r2);
	maxrr = rNorm;
	maxrx = rNorm;
	r0Norm = rNorm;      
	rUpdate++;

	// explicitly restore the orthogonality of the gradient vector
	double rp = reDotProductCuda(rSloppy, p) / (r2);
	axpyCuda(-rp, rSloppy, p);

	beta = r2 / r2_old; 
	xpayCuda(rSloppy, beta, p);

	if(use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(y,r).z);
	
	steps_since_reliable = 0;
      }

      breakdown = false;
      k++;

      PrintStats("CG", k, r2, b2, heavy_quark_res);
    }

    if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
    xpyCuda(y, x);

    profile.Stop(QUDA_PROFILE_COMPUTE);
    profile.Start(QUDA_PROFILE_EPILOGUE);

    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
    double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
    reduceDouble(gflops);
      param.gflops = gflops;
    param.iter += k;

    if (k==param.maxiter) 
      warningQuda("Exceeded maximum iterations %d", param.maxiter);

    if (getVerbosity() >= QUDA_VERBOSE)
      printfQuda("CG: Reliable updates = %d\n", rUpdate);

    // compute the true residuals
    mat(r, x, y);
    param.true_res = sqrt(xmyNormCuda(b, r) / b2);
#if (__COMPUTE_CAPABILITY__ >= 200)
    param.true_res_hq = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
#else
    param.true_res_hq = 0.0;
#endif      

    PrintSummary("CG", k, r2, b2);

    // reset the flops counters
    quda::blas_flops = 0;
    mat.flops();
    matSloppy.flops();

    profile.Stop(QUDA_PROFILE_EPILOGUE);
    profile.Start(QUDA_PROFILE_FREE);

    if (&tmp2 != &tmp) delete tmp2_p;

    if (param.precision_sloppy != x.Precision()) {
      delete r_sloppy;
      delete x_sloppy;
    }

    profile.Stop(QUDA_PROFILE_FREE);

    return;
  }
예제 #22
0
int mySetState(void * hndl, UserState * ustate, unsigned int privdata)
{
    int val;
    static unsigned int testmode;

    log_verbose(KERN_INFO "Reached mySetState with privdata %x\n", privdata);

    /* Check driver state */
    if(DriverState != REGISTERED)
    {
        printk("Driver does not seem to be ready\n");
        return EFAULT;
    }

    /* Check handle value */
    if((hndl != handle[0]) && (hndl != handle[2]))
    {
        printk("Came with wrong handle\n");
        return EBADF;
    }

    /* Valid only for TX engine */
    if(privdata == 0x54545454)
    {
        spin_lock_bh(&RawLock);

        /* Set up the value to be written into the register */
        RawTestMode = ustate->TestMode;

        if(RawTestMode & TEST_START)
        {
            testmode = 0;
            if(RawTestMode & ENABLE_LOOPBACK) testmode |= LOOPBACK;
            if(RawTestMode & ENABLE_PKTCHK) testmode |= PKTCHKR;
            if(RawTestMode & ENABLE_PKTGEN) testmode |= PKTGENR;
        }
        else
        {
            /* Deliberately not clearing the loopback bit, incase a
             * loopback test was going on - allows the loopback path
             * to drain off packets. Just stopping the source of packets.
             */
            if(RawTestMode & ENABLE_PKTCHK) testmode &= ~PKTCHKR;
            if(RawTestMode & ENABLE_PKTGEN) testmode &= ~PKTGENR;
        }

        printk("SetState TX with RawTestMode %x, reg value %x\n",
               RawTestMode, testmode);

        /* Now write the registers */
        if(RawTestMode & TEST_START)
        {
            if(!(RawTestMode & (ENABLE_PKTCHK|ENABLE_PKTGEN|ENABLE_LOOPBACK)))
            {
                printk("%s Driver: TX Test Start with wrong mode %x\n",
                       MYNAME, testmode);
                RawTestMode = 0;
                spin_unlock_bh(&RawLock);
                return EBADRQC;
            }

            printk("%s Driver: Starting the test - mode %x, reg %x\n",
                   MYNAME, RawTestMode, testmode);

            /* Next, set packet sizes. Ensure they don't exceed PKTSIZEs */
            RawMinPktSize = ustate->MinPktSize;
            RawMaxPktSize = ustate->MaxPktSize;

            /* Set RX packet size for memory path */
            val = RawMaxPktSize;
            if(val % BYTEMULTIPLE)
            {
                printk("********** ODD PACKET SIZE **********\n");
                //val -= (val % BYTEMULTIPLE);
            }
            printk("Reg %x = %x\n", PKT_SIZE_ADDRESS, val);
            RawMinPktSize = RawMaxPktSize = val;

            /* Now ensure the sizes remain within bounds */
            if(RawMaxPktSize > MAXPKTSIZE)
                RawMinPktSize = RawMaxPktSize = MAXPKTSIZE;
            if(RawMinPktSize < MINPKTSIZE)
                RawMinPktSize = RawMaxPktSize = MINPKTSIZE;
            if(RawMinPktSize > RawMaxPktSize)
                RawMinPktSize = RawMaxPktSize;
            val = RawMaxPktSize;

            printk("========Reg %x = %d\n", PKT_SIZE_ADDRESS, val);
            XIo_Out32(TXbarbase+PKT_SIZE_ADDRESS, val);
            printk("RxPktSize %d\n", val);

            /*
                  #ifdef XRAWDATA0
                    XIo_Out32(TXbarbase+START_ADRS_0, 0x00000000);
                    XIo_Out32(TXbarbase+START_ADRS_1, 0x04000000);
                    XIo_Out32(TXbarbase+END_ADRS_0, 0x03000000);
                    XIo_Out32(TXbarbase+END_ADRS_1, 0x07000000);
                    XIo_Out32(TXbarbase+WRBURST_0, BURST_SIZE );
                    XIo_Out32(TXbarbase+RDBURST_0, BURST_SIZE );
                    XIo_Out32(TXbarbase+WRBURST_1, BURST_SIZE );
                    XIo_Out32(TXbarbase+RDBURST_1, BURST_SIZE );
                  #else
                    XIo_Out32(TXbarbase+START_ADRS_2, 0x08000000);
                    XIo_Out32(TXbarbase+START_ADRS_3, 0x0C000000);
                    XIo_Out32(TXbarbase+END_ADRS_2, 0x0B000000);
                    XIo_Out32(TXbarbase+END_ADRS_3, 0x0F000000);
                    XIo_Out32(TXbarbase+WRBURST_2, BURST_SIZE );
                    XIo_Out32(TXbarbase+RDBURST_2, BURST_SIZE );
                    XIo_Out32(TXbarbase+WRBURST_3, BURST_SIZE );
                    XIo_Out32(TXbarbase+RDBURST_3, BURST_SIZE );
                  #endif
            */

            /* Incase the last test was a loopback test, that bit may not be cleared. */
            XIo_Out32(TXbarbase+TX_CONFIG_ADDRESS, 0);
            if(RawTestMode & (ENABLE_PKTCHK|ENABLE_LOOPBACK))
            {
                TxSeqNo = 0;
                if(RawTestMode & ENABLE_LOOPBACK)
                    RxSeqNo = 0;
                printk("========Reg %x = %x\n", TX_CONFIG_ADDRESS, testmode);
                XIo_Out32(TXbarbase+TX_CONFIG_ADDRESS, testmode);
            }
            if(RawTestMode & ENABLE_PKTGEN)
            {
                RxSeqNo = 0;
                printk("========Reg %x = %x\n", RX_CONFIG_ADDRESS, testmode);
                XIo_Out32(TXbarbase+RX_CONFIG_ADDRESS, testmode);
            }

        }
        /* Else, stop the test. Do not remove any loopback here because
         * the DMA queues and hardware FIFOs must drain first.
         */
        else
        {
            printk("%s Driver: Stopping the test, mode %x\n", MYNAME, testmode);
            printk("========Reg %x = %x\n", TX_CONFIG_ADDRESS, testmode);
            XIo_Out32(TXbarbase+TX_CONFIG_ADDRESS, testmode);
            printk("========Reg %x = %x\n", RX_CONFIG_ADDRESS, testmode);
            XIo_Out32(TXbarbase+RX_CONFIG_ADDRESS, testmode);

            /* Not resetting sequence numbers here - causes problems
             * in debugging. Instead, reset the sequence numbers when
             * starting a test.
             */
        }

        PrintSummary();
        spin_unlock_bh(&RawLock);
    }

    return 0;
}
예제 #23
0
int main (int argc, char **argv) 
{
    int i;
    unsigned int total_len = 0;
    struct timeval begtime,endtime;
    FILE *sfd,*pfd;
    char sfilename[20] = "string";
    char pfilename[20] = "pattern";

	
//=============================================== 
    if (argc < 4)
    {
        fprintf (stderr,"Usage: acsmx stringfile patternfile ...  -nocase\n");
        exit (0);
    }
    strcpy (sfilename, argv[1]);
    sfd = fopen(sfilename,"r");
    if(sfd == NULL)
    {
        fprintf(stderr,"Open file error!\n");
        exit(1);
    }

    strcpy(pfilename,argv[2]);
    pfd = fopen(pfilename,"r");
    if(sfd == NULL)
    {
        fprintf(stderr,"Open file error!\n");
        exit(1);
    }
    thread_num = atoi(argv[3]);
   	acsm = acsmNew (thread_num); 
   
//read patterns    
	i = 0;
    while(fgets(pattern,MAXPATTERNLEN,pfd))
    {
    	int len = strlen(pattern);
    	acsmAddPattern (acsm, pattern, len-1);
		i++;
    }
    fclose(pfd);
    printf("\n\nread %d patterns\n\n===============================",i);
    /* Generate GtoTo Table and Fail Table */
    acsmCompile (acsm);
//========================================================= 

    /*read string*/
    for(i = 0;i < MAXLINE;i++)
    {
    	if(!fgets(text[i],MAXLEN,sfd))
    		break;
   		total_len += strlen(text[i]) - 1; //ignore the last char '\n'
    }
    line = i;
    fclose(sfd);
    printf("\n\nreading finished...\n=============================\n\n");
    printf("%d lines\t%d bytes",line,total_len);
    printf("\n\n=============================\n");
    
    gettimeofday(&begtime,0);
    //create multi_thread
    thread_array = calloc(thread_num,sizeof(pthread_t));
	valid_len_array = calloc(thread_num,sizeof(unsigned int));
    pthread_barrier_init(&barrier_thread,NULL,thread_num);
    pthread_barrier_init(&barrier_validation,NULL,thread_num);
 
    for(i = 0;i < thread_num; i++)
	{
		pthread_create(&thread_array[i], NULL, SearchThread, (void*)i);
    }
//=========================================================== 
    int err;
    for(i = 0;i < thread_num;i++)
    {
        err = pthread_join(thread_array[i],NULL);
        if(err != 0)
        {
            printf("can not join with thread %d:%s\n", i,strerror(err));
        }
    }
    gettimeofday(&endtime,0);

    PrintSummary(acsm);
    acsmFree (acsm);

    printf ("\n### AC Match Finished ###\n");
    printf("\nTime Cost: %lu us\n\n",(endtime.tv_sec - begtime.tv_sec)*1000000 + (endtime.tv_usec - begtime.tv_usec));
    printf ("\n====================================\n\n");
    printf ("Validation Stage Len:\n\n");
    for(i = 0;i < thread_num;i++)
        printf("rank%d\t%u\n",i,valid_len_array[i]);
    printf ("\n====================================\n\n");
   
    free(thread_array);
    free(valid_len_array);
    pthread_barrier_destroy(&barrier_thread);
    pthread_barrier_destroy(&barrier_validation);
    return 0;
}
예제 #24
0
파일: nfdump.c 프로젝트: gitpan/libnf
int main( int argc, char **argv ) {
struct stat stat_buff;
stat_record_t	sum_stat;
printer_t 	print_header, print_record;
nfprof_t 	profile_data;
char 		*rfile, *Rfile, *Mdirs, *wfile, *ffile, *filter, *tstring, *stat_type;
char		*byte_limit_string, *packet_limit_string, *print_format, *record_header;
char		*print_order, *query_file, *UnCompress_file, *nameserver, *aggr_fmt;
int 		c, ffd, ret, element_stat, fdump;
int 		i, user_format, quiet, flow_stat, topN, aggregate, aggregate_mask, bidir;
int 		print_stat, syntax_only, date_sorted, do_tag, compress, do_xstat;
int			plain_numbers, GuessDir, pipe_output, csv_output;
time_t 		t_start, t_end;
uint32_t	limitflows;
char 		Ident[IDENTLEN];

	rfile = Rfile = Mdirs = wfile = ffile = filter = tstring = stat_type = NULL;
	byte_limit_string = packet_limit_string = NULL;
	fdump = aggregate = 0;
	aggregate_mask	= 0;
	bidir			= 0;
	t_start = t_end = 0;
	syntax_only	    = 0;
	topN	        = -1;
	flow_stat       = 0;
	print_stat      = 0;
	element_stat  	= 0;
	do_xstat 		= 0;
	limitflows		= 0;
	date_sorted		= 0;
	total_bytes		= 0;
	total_flows		= 0;
	skipped_blocks	= 0;
	do_tag			= 0;
	quiet			= 0;
	user_format		= 0;
	compress		= 0;
	plain_numbers   = 0;
	pipe_output		= 0;
	csv_output		= 0;
	is_anonymized	= 0;
	GuessDir		= 0;
	nameserver		= NULL;

	print_format    = NULL;
	print_header 	= NULL;
	print_record  	= NULL;
	print_order  	= NULL;
	query_file		= NULL;
	UnCompress_file	= NULL;
	aggr_fmt		= NULL;
	record_header 	= NULL;

	Ident[0] = '\0';

	while ((c = getopt(argc, argv, "6aA:Bbc:D:E:s:hHn:i:j:f:qzr:v:w:K:M:NImO:R:XZt:TVv:x:l:L:o:")) != EOF) {
		switch (c) {
			case 'h':
				usage(argv[0]);
				exit(0);
				break;
			case 'a':
				aggregate = 1;
				break;
			case 'A':
				if ( !ParseAggregateMask(optarg, &aggr_fmt ) ) {
					exit(255);
				}
				aggregate_mask = 1;
				break;
			case 'B':
				GuessDir = 1;
			case 'b':
				if ( !SetBidirAggregation() ) {
					exit(255);
				}
				bidir	  = 1;
				// implies
				aggregate = 1;
				break;
			case 'D':
				nameserver = optarg;
				if ( !set_nameserver(nameserver) ) {
					exit(255);
				}
				break;
			case 'E':
				query_file = optarg;
				if ( !InitExporterList() ) {
					exit(255);
				}
				PrintExporters(query_file);
				exit(0);
				break;
			case 'X':
				fdump = 1;
				break;
			case 'Z':
				syntax_only = 1;
				break;
			case 'q':
				quiet = 1;
				break;
			case 'z':
				compress = 1;
				break;
			case 'c':	
				limitflows = atoi(optarg);
				if ( !limitflows ) {
					LogError("Option -c needs a number > 0\n");
					exit(255);
				}
				break;
			case 's':
				stat_type = optarg;
                if ( !SetStat(stat_type, &element_stat, &flow_stat) ) {
                    exit(255);
                } 
				break;
			case 'V': {
				char *e1, *e2;
				e1 = "";
				e2 = "";
#ifdef NSEL
				e1 = "NSEL-NEL";
#endif
				printf("%s: Version: %s%s%s\n",argv[0], e1, e2, nfdump_version);
				exit(0);
				} break;
			case 'l':
				packet_limit_string = optarg;
				break;
			case 'K':
				LogError("*** Anonymisation moved! Use nfanon to anonymise flows!\n");
				exit(255);
				break;
			case 'H':
				do_xstat = 1;
				break;
			case 'L':
				byte_limit_string = optarg;
				break;
			case 'N':
				plain_numbers = 1;
				break;
			case 'f':
				ffile = optarg;
				break;
			case 't':
				tstring = optarg;
				break;
			case 'r':
				rfile = optarg;
				if ( strcmp(rfile, "-") == 0 )
					rfile = NULL;
				break;
			case 'm':
				print_order = "tstart";
				Parse_PrintOrder(print_order);
				date_sorted = 1;
				LogError("Option -m depricated. Use '-O tstart' instead\n");
				break;
			case 'M':
				Mdirs = optarg;
				break;
			case 'I':
				print_stat++;
				break;
			case 'o':	// output mode
				print_format = optarg;
				break;
			case 'O': {	// stat order by
				int ret;
				print_order = optarg;
				ret = Parse_PrintOrder(print_order);
				if ( ret < 0 ) {
					LogError("Unknown print order '%s'\n", print_order);
					exit(255);
				}
				date_sorted = ret == 6;		// index into order_mode
				} break;
			case 'R':
				Rfile = optarg;
				break;
			case 'w':
				wfile = optarg;
				break;
			case 'n':
				topN = atoi(optarg);
				if ( topN < 0 ) {
					LogError("TopnN number %i out of range\n", topN);
					exit(255);
				}
				break;
			case 'T':
				do_tag = 1;
				break;
			case 'i':
				strncpy(Ident, optarg, IDENT_SIZE);
				Ident[IDENT_SIZE - 1] = 0;
				if ( strchr(Ident, ' ') ) {
					LogError("Ident must not contain spaces\n");
					exit(255);
				}
				break;
			case 'j':
				UnCompress_file = optarg;
				UnCompressFile(UnCompress_file);
				exit(0);
				break;
			case 'x':
				query_file = optarg;
				InitExtensionMaps(NO_EXTENSION_LIST);
				DumpExMaps(query_file);
				exit(0);
				break;
			case 'v':
				query_file = optarg;
				QueryFile(query_file);
				exit(0);
				break;
			case '6':	// print long IPv6 addr
				Setv6Mode(1);
				break;
			default:
				usage(argv[0]);
				exit(0);
		}
	}
	if (argc - optind > 1) {
		usage(argv[0]);
		exit(255);
	} else {
		/* user specified a pcap filter */
		filter = argv[optind];
		FilterFilename = NULL;
	}
	
	// Change Ident only
	if ( rfile && strlen(Ident) > 0 ) {
		ChangeIdent(rfile, Ident);
		exit(0);
	}

	if ( (element_stat || flow_stat) && (topN == -1)  ) 
		topN = 10;

	if ( topN < 0 )
		topN = 0;

	if ( (element_stat && !flow_stat) && aggregate_mask ) {
		LogError("Warning: Aggregation ignored for element statistics\n");
		aggregate_mask = 0;
	}

	if ( !flow_stat && aggregate_mask ) {
		aggregate = 1;
	}

	if ( rfile && Rfile ) {
		LogError("-r and -R are mutually exclusive. Plase specify either -r or -R\n");
		exit(255);
	}
	if ( Mdirs && !(rfile || Rfile) ) {
		LogError("-M needs either -r or -R to specify the file or file list. Add '-R .' for all files in the directories.\n");
		exit(255);
	}

	extension_map_list = InitExtensionMaps(NEEDS_EXTENSION_LIST);
	if ( !InitExporterList() ) {
		exit(255);
	}

	SetupInputFileSequence(Mdirs, rfile, Rfile);

	if ( print_stat ) {
		nffile_t *nffile;
		if ( !rfile && !Rfile && !Mdirs) {
			LogError("Expect data file(s).\n");
			exit(255);
		}

		memset((void *)&sum_stat, 0, sizeof(stat_record_t));
		sum_stat.first_seen = 0x7fffffff;
		sum_stat.msec_first = 999;
		nffile = GetNextFile(NULL, 0, 0);
		if ( !nffile ) {
			LogError("Error open file: %s\n", strerror(errno));
			exit(250);
		}
		while ( nffile && nffile != EMPTY_LIST ) {
			SumStatRecords(&sum_stat, nffile->stat_record);
			nffile = GetNextFile(nffile, 0, 0);
		}
		PrintStat(&sum_stat);
		exit(0);
	}

	// handle print mode
	if ( !print_format ) {
		// automatically select an appropriate output format for custom aggregation
		// aggr_fmt is compiled by ParseAggregateMask
		if ( aggr_fmt ) {
			int len = strlen(AggrPrependFmt) + strlen(aggr_fmt) + strlen(AggrAppendFmt) + 7;	// +7 for 'fmt:', 2 spaces and '\0'
			print_format = malloc(len);
			if ( !print_format ) {
				LogError("malloc() error in %s line %d: %s\n", __FILE__, __LINE__, strerror(errno) );
				exit(255);
			}
			snprintf(print_format, len, "fmt:%s %s %s",AggrPrependFmt, aggr_fmt, AggrAppendFmt );
			print_format[len-1] = '\0';
		} else if ( bidir ) {
			print_format = "biline";
		} else
			print_format = DefaultMode;
	}

	if ( strncasecmp(print_format, "fmt:", 4) == 0 ) {
		// special user defined output format
		char *format = &print_format[4];
		if ( strlen(format) ) {
			if ( !ParseOutputFormat(format, plain_numbers, printmap) )
				exit(255);
			print_record  = format_special;
			record_header = get_record_header();
			user_format	  = 1;
		} else {
			LogError("Missing format description for user defined output format!\n");
			exit(255);
		}
	} else {
		// predefined output format

		// Check for long_v6 mode
		i = strlen(print_format);
		if ( i > 2 ) {
			if ( print_format[i-1] == '6' ) {
				Setv6Mode(1);
				print_format[i-1] = '\0';
			} else 
				Setv6Mode(0);
		}

		i = 0;
		while ( printmap[i].printmode ) {
			if ( strncasecmp(print_format, printmap[i].printmode, MAXMODELEN) == 0 ) {
				if ( printmap[i].Format ) {
					if ( !ParseOutputFormat(printmap[i].Format, plain_numbers, printmap) )
						exit(255);
					// predefined custom format
					print_record  = printmap[i].func;
					record_header = get_record_header();
					user_format	  = 1;
				} else {
					// To support the pipe output format for element stats - check for pipe, and remember this
					if ( strncasecmp(print_format, "pipe", MAXMODELEN) == 0 ) {
						pipe_output = 1;
					}
					if ( strncasecmp(print_format, "csv", MAXMODELEN) == 0 ) {
						csv_output = 1;
						set_record_header();
						record_header = get_record_header();
					}
					// predefined static format
					print_record  = printmap[i].func;
					user_format	  = 0;
				}
				break;
			}
			i++;
		}
	}

	if ( !print_record ) {
		LogError("Unknown output mode '%s'\n", print_format);
		exit(255);
	}

	// this is the only case, where headers are printed.
	if ( strncasecmp(print_format, "raw", 16) == 0 )
		print_header = format_file_block_header;
	
	if ( aggregate && (flow_stat || element_stat) ) {
		aggregate = 0;
		LogError("Command line switch -s overwrites -a\n");
	}

	if ( !filter && ffile ) {
		if ( stat(ffile, &stat_buff) ) {
			LogError("Can't stat filter file '%s': %s\n", ffile, strerror(errno));
			exit(255);
		}
		filter = (char *)malloc(stat_buff.st_size+1);
		if ( !filter ) {
			LogError("malloc() error in %s line %d: %s\n", __FILE__, __LINE__, strerror(errno) );
			exit(255);
		}
		ffd = open(ffile, O_RDONLY);
		if ( ffd < 0 ) {
			LogError("Can't open filter file '%s': %s\n", ffile, strerror(errno));
			exit(255);
		}
		ret = read(ffd, (void *)filter, stat_buff.st_size);
		if ( ret < 0   ) {
			perror("Error reading filter file");
			close(ffd);
			exit(255);
		}
		total_bytes += ret;
		filter[stat_buff.st_size] = 0;
		close(ffd);

		FilterFilename = ffile;
	}

	// if no filter is given, set the default ip filter which passes through every flow
	if ( !filter  || strlen(filter) == 0 ) 
		filter = "any";

	Engine = CompileFilter(filter);
	if ( !Engine ) 
		exit(254);

	if ( fdump ) {
		printf("StartNode: %i Engine: %s\n", Engine->StartNode, Engine->Extended ? "Extended" : "Fast");
		DumpList(Engine);
		exit(0);
	}

	if ( syntax_only )
		exit(0);

	if ( print_order && flow_stat ) {
		printf("-s record and -O (-m) are mutually exclusive options\n");
		exit(255);
	}

	if ((aggregate || flow_stat || print_order)  && !Init_FlowTable() )
			exit(250);

	if (element_stat && !Init_StatTable(HashBits, NumPrealloc) )
			exit(250);

	SetLimits(element_stat || aggregate || flow_stat, packet_limit_string, byte_limit_string);

	if ( tstring ) {
		if ( !ScanTimeFrame(tstring, &t_start, &t_end) )
			exit(255);
	}


	if ( !(flow_stat || element_stat || wfile || quiet ) && record_header ) {
		if ( user_format ) {
			printf("%s\n", record_header);
		} else {
			// static format - no static format with header any more, but keep code anyway
			if ( Getv6Mode() ) {
				printf("%s\n", record_header);
			} else
				printf("%s\n", record_header);
		}
	}

	nfprof_start(&profile_data);
	sum_stat = process_data(wfile, element_stat, aggregate || flow_stat, print_order != NULL,
						print_header, print_record, t_start, t_end, 
						limitflows, do_tag, compress, do_xstat);
	nfprof_end(&profile_data, total_flows);

	if ( total_bytes == 0 ) {
		printf("No matched flows\n");
		exit(0);
	}

	if (aggregate || print_order) {
		if ( wfile ) {
			nffile_t *nffile = OpenNewFile(wfile, NULL, compress, is_anonymized, NULL);
			if ( !nffile ) 
				exit(255);
			if ( ExportFlowTable(nffile, aggregate, bidir, date_sorted, extension_map_list) ) {
				CloseUpdateFile(nffile, Ident );	
			} else {
				CloseFile(nffile);
				unlink(wfile);
			}
			DisposeFile(nffile);
		} else {
			PrintFlowTable(print_record, topN, do_tag, GuessDir, extension_map_list);
		}
	}

	if (flow_stat) {
		PrintFlowStat(record_header, print_record, topN, do_tag, quiet, csv_output, extension_map_list);
#ifdef DEVEL
		printf("Loopcnt: %u\n", loopcnt);
#endif
	} 

	if (element_stat) {
		PrintElementStat(&sum_stat, plain_numbers, record_header, print_record, topN, do_tag, quiet, pipe_output, csv_output);
	} 

	if ( !quiet ) {
		if ( csv_output ) {
			PrintSummary(&sum_stat, plain_numbers, csv_output);
		} else if ( !wfile ) {
			if (is_anonymized)
				printf("IP addresses anonymised\n");
			PrintSummary(&sum_stat, plain_numbers, csv_output);
			if ( t_last_flow == 0 ) {
				// in case of a pre 1.6.6 collected and empty flow file
 				printf("Time window: <unknown>\n");
			} else {
 				printf("Time window: %s\n", TimeString(t_first_flow, t_last_flow));
			}
			printf("Total flows processed: %u, Blocks skipped: %u, Bytes read: %llu\n", 
				total_flows, skipped_blocks, (unsigned long long)total_bytes);
			nfprof_print(&profile_data, stdout);
		}
	}

	Dispose_FlowTable();
	Dispose_StatTable();
	FreeExtensionMaps(extension_map_list);

#ifdef DEVEL
	if ( hash_hit || hash_miss )
		printf("Hash hit: %i, miss: %i, skip: %i, ratio: %5.3f\n", hash_hit, hash_miss, hash_skip, (float)hash_hit/((float)(hash_hit+hash_miss)));
#endif

	return 0;
}
예제 #25
0
//---------------------------------------------
//mining frequent itemsets from database
//---------------------------------------------
void MineAllPats()
{
	ITEM_COUNTER* pfreqitems;
	int *pitem_order_map;
	double ntotal_occurrences;

	clock_t start, start_mining;;

	int i;
	if(goparameters.bresult_name_given)
		gpfsout = new FSout(goparameters.pat_filename);

	start_mining = clock();

	gntotal_patterns = 0;
	gnmax_pattern_len = 0;
	gntotal_singlepath = 0;

	gndepth = 0;
	gntotal_call = 0;
	gpdata = new Data(goparameters.data_filename);

	//count frequent items in original database
	pfreqitems = NULL;
	start = clock();
	goDBMiner.ScanDBCountFreqItems(&pfreqitems);
	//goTimeTracer.mdInitial_count_time = (double)(clock()-start)/CLOCKS_PER_SEC;

	gppat_counters = new int[gnmax_trans_len];
	//goMemTracer.IncBuffer(gnmax_trans_len*sizeof(int));
	for(i=0;i<gnmax_trans_len;i++)
		gppat_counters[i] = 0;

	if(gntotal_freqitems==0)
		delete gpdata;
	else if(gntotal_freqitems==1)
	{
		gnmax_pattern_len = 1;
		OutputOnePattern(pfreqitems[0].item, pfreqitems[0].support);
		delete gpdata;
	}
	else if(gntotal_freqitems>1)
	{
		gpheader_itemset = new int[gntotal_freqitems];
		//goMemTracer.IncBuffer(gntotal_freqitems*sizeof(int));

		//an array which maps an item to its frequency order; the most infrequent item has order 0
		pitem_order_map = new int[gnmax_item_id];
		for(i=0;i<gnmax_item_id;i++)
			pitem_order_map[i] = -1;
		for(i=0;i<gntotal_freqitems;i++)
			pitem_order_map[pfreqitems[i].item] = i;

		//if the number of frequent items in a conditonal database is smaller than maximal bucket size, use bucket counting technique
		if(gntotal_freqitems <= MAX_BUCKET_SIZE)
		{
			gpbuckets = new int[(1<<gntotal_freqitems)];
			memset(gpbuckets, 0, sizeof(int)*(1<<gntotal_freqitems));
			goDBMiner.ScanDBBucketCount(pitem_order_map, gpbuckets);
			bucket_count(gpbuckets, gntotal_freqitems, pfreqitems);
			delete []gpbuckets;
			delete gpdata;
			delete []pitem_order_map;
		}
		else
		{
			HEADER_TABLE pheader_table;

			goAFOPTMiner.Init(MIN(gnmax_trans_len, gntotal_freqitems+1));
			gpbuckets = new int[(1<<MAX_BUCKET_SIZE)];
			//goMemTracer.IncBuffer(sizeof(int)*(1<<MAX_BUCKET_SIZE));

			//initialize header table
			start = clock();
			pheader_table = new HEADER_NODE[gntotal_freqitems];
			//goMemTracer.IncBuffer(sizeof(HEADER_NODE)*gntotal_freqitems);
			ntotal_occurrences = 0;
			for(i=0;i<gntotal_freqitems;i++)
			{
				pheader_table[i].item = pfreqitems[i].item;
				pheader_table[i].nsupport = pfreqitems[i].support;
				pheader_table[i].parray_conddb = NULL;
				pheader_table[i].order = i;
				ntotal_occurrences += pfreqitems[i].support;
			}

			//to choose a proper representation format for each conditional database
			if((double)goparameters.nmin_sup/gndb_size>=BUILD_TREE_MINSUP_THRESHOLD || 
				ntotal_occurrences/(gndb_size*gntotal_freqitems)>=BUILD_TREE_AVGSUP_THRESHOLD ||
				gntotal_freqitems<=BUILD_TREE_ITEM_THRESHOLD )
			{
				for(i=0;i<gntotal_freqitems;i++)
					pheader_table[i].flag = AFOPT_FLAG;
			}
			else
			{
				for(i=0;i<gntotal_freqitems-BUILD_TREE_ITEM_THRESHOLD;i++)
					pheader_table[i].flag = 0;
				for(i=MAX(0, gntotal_freqitems-BUILD_TREE_ITEM_THRESHOLD);i<gntotal_freqitems;i++)
					pheader_table[i].flag = AFOPT_FLAG;
			}

			//scan database to construct conditional databases and do bucket counting
			memset(gpbuckets, 0, sizeof(int)*(1<<MAX_BUCKET_SIZE));
			goDBMiner.ScanDBBuildCondDB(pheader_table, pitem_order_map, gntotal_freqitems, gpbuckets);
			//goTimeTracer.mdInitial_construct_time = (double)(clock()-start)/CLOCKS_PER_SEC;
			//goMemTracer.mninitial_struct_size = //goMemTracer.mnArrayDB_size+//goMemTracer.mnAFOPTree_size+sizeof(int)*(1<<MAX_BUCKET_SIZE);			
			bucket_count(gpbuckets, MAX_BUCKET_SIZE, &(pfreqitems[gntotal_freqitems-MAX_BUCKET_SIZE]));

			delete []pitem_order_map;
			delete gpdata;
			
			//mining frequent itemsets in depth first order
			if((double)goparameters.nmin_sup/gndb_size>=BUILD_TREE_MINSUP_THRESHOLD ||
				ntotal_occurrences/(gndb_size*gntotal_freqitems)>=BUILD_TREE_AVGSUP_THRESHOLD ||
				gntotal_freqitems<=BUILD_TREE_ITEM_THRESHOLD)
				goAFOPTMiner.DepthAFOPTGrowth(pheader_table, gntotal_freqitems, 0);
			else
			{
				goArrayMiner.DepthArrayGrowth(pheader_table, gntotal_freqitems);
				goAFOPTMiner.DepthAFOPTGrowth(pheader_table, gntotal_freqitems, gntotal_freqitems-BUILD_TREE_ITEM_THRESHOLD);
			}

			delete []pheader_table;
			//goMemTracer.DecBuffer(gntotal_freqitems*sizeof(HEADER_NODE));
			delete []gpbuckets;
			//goMemTracer.DecBuffer(sizeof(int)*(1<<MAX_BUCKET_SIZE));

		}
		delete []gpheader_itemset;
		//goMemTracer.DecBuffer(gntotal_freqitems*sizeof(int));
	}
	delete []pfreqitems;
	//goMemTracer.DecBuffer(gntotal_freqitems*sizeof(ITEM_COUNTER));
	
	if(goparameters.bresult_name_given)
		delete gpfsout;
	//goTimeTracer.mdtotal_running_time = (double)(clock()-start_mining)/CLOCKS_PER_SEC;

	PrintSummary();
	delete []gppat_counters;
	//goMemTracer.DecBuffer(gnmax_trans_len*sizeof(int));
}