/** * Insert the record whose contents is pointed at by recPtr into relation relNum. * * @param relNum - Relation number * @param recPtr - A pointer to a record-sized byte array whose contents * will be copied to an empty record slot in the relation. * @return OK or NOTOK * * @author nithin * * GLOBAL VARIABLES MODIFIED: * g_Buffer[relNum] * g_CatCache[relNum] * * ERRORS REPORTED: * NULL_ARGUMENT_RECEIVED * DUPLICATE_TUPLE * * ALGORITHM: * 1. Check if the record pointed by recPtr already exists in the relation * (only if the check duplicates flag is set) * 2. Find the first free slot in the relation by linear search * 3. Copy the contents into that slot * (using a loop and not strcpy) * 4. Update the dirty bit and slotmap in Buffer * 5. Update the dirty bit, numRecs and numPgs in CatCache * * IMPLEMENTATION NOTES: * Uses ReadPage() * * */ int InsertRec(const int relNum, char*recPtr) { if (recPtr == NULL) { return ErrorMsgs(NULL_ARGUMENT_RECEIVED, g_PrintFlag); } /* Checking for duplicates */ Rid *fRid, sRid = { 0, 0 }; char *record; while (GetNextRec(relNum, &sRid, &fRid, &record) == OK && g_CheckDuplicateTuples == OK) { if (compareRecords(record, recPtr, g_CatCache[relNum].recLength) == OK) { return ErrorMsgs(DUPLICATE_TUPLE, g_PrintFlag); } sRid = *fRid; free(fRid); } Rid startRid = { 1, 0 }, foundRid; /* Insert record */ getNextFreeSlot(relNum, startRid, &foundRid); ReadPage(relNum, foundRid.pid); unsigned int recLength = g_CatCache[relNum].recLength; int i, j; int offset = (foundRid.slotnum - 1) * recLength; for (i = offset, j = 0; j < recLength; ++i, j++) { g_Buffer[relNum].page.contents[i] = recPtr[j]; } /* Update dirty bits and slotmap*/ g_Buffer[relNum].dirty = TRUE; g_Buffer[relNum].page.slotmap = (g_Buffer[relNum].page.slotmap | 1 << (32 - foundRid.slotnum)); /* Update numRecs in catCache*/ g_CatCache[relNum].dirty = TRUE; g_CatCache[relNum].numRecs++; g_CatCache[relNum].numPgs = g_CatCache[relNum].numPgs > foundRid.pid ? 
g_CatCache[relNum].numPgs : foundRid.pid; return OK; }
void MergeJoin (char *infile1, char *infile2, unsigned char field, block_t *buffer, unsigned int nmem_blocks, char *outfile, unsigned int *nres, unsigned int *nios){ //memSize of buffer is -2 cause 2 last blocks is used for, one reading block from big file and two writing to output int memSize = nmem_blocks - 2; //get sizes of files int infile1Size = getSize(infile1); int infile2Size = getSize(infile2); unsigned int noneed1=0, noneed2=0, ios=0; *nres = 0; *nios = 0; FILE *out = fopen(outfile, "ab"); char outfile1[] = "outfile1.bin"; char outfile2[] = "outfile2.bin"; MergeSort(infile1, 1, buffer, nmem_blocks, outfile1, &noneed1, &noneed2, &ios); (*nios) += ios; MergeSort(infile2, 1, buffer, nmem_blocks, outfile2, &noneed1, &noneed2, &ios); (*nios) += ios; //if file1 is bigger switch files cause next we assume that file1 is the small one. if(infile1Size > infile2Size){ char temp0 = outfile1[7]; outfile1[7] = outfile2[7]; outfile2[7] = temp0; int temp = infile1Size; infile1Size = infile2Size; infile2Size = temp; } FILE *input1 = fopen(outfile1, "rb"); FILE *input2 = fopen(outfile2, "rb"); ////printfile(outfile1); ////printfile(outfile2); block_t *bigFileBlock = buffer + memSize; block_t *outputBlock = buffer + memSize + 1; (*outputBlock).blockid = 0; //offset that changes every time we need new block from big file to check int bigFileBlockOffset=0; //counts the records of buffer that has same value of a record of bigFileBlock int countSameBufferEntries=0; //number of blocks in big file. Useful to end the main loop. 
int blocks = infile2Size - 1; //printf("%d", blocks); //this will represent the id of the first block of the buffer int firstBlockId = 0; int lastBlockId = memSize-1; //at first we read first blocks of small and big file so we have something to compare in first loop (*nios) += readBuffer(buffer, input1, 0, memSize); (*nios) += readBlock(bigFileBlock, input2, 0); recordPos bufferRecPos = getRecordPos(0); record_t tempBufferRec; int tempBufferBlock=0; record_t bigFileBlockRec; while(blocks>0){ /* General: if(bufferRecPos.block%memSize==firstBlockId%memSize) this condition checks if we have reached i circle in the buffer. firstBlockId%memSize defines the first block of the buffer and makes it easy to replace it with another block if necessary with: buffer + firstBlockId%memSize */ for(int blockEntrie=0; blockEntrie<MAX_RECORDS_PER_BLOCK; blockEntrie++){ bigFileBlockRec = (*bigFileBlock).entries[blockEntrie]; if(compareRecords(tempBufferRec, bigFileBlockRec, field)==0){ //Here we have to go back to the block and record of tempBufferRec. for(int i=0; i<countSameBufferEntries; i++){ decr(bufferRecPos); ////printf("%d\n", bufferRecPos.block); } int i=0;//keeps i for load2 int load, load2; if(countSameBufferEntries/memSize > memSize){ load = memSize; load2=0; }else{ load = countSameBufferEntries/memSize; load2 = countSameBufferEntries%memSize; } for( ;i<load; i++){ (*nios) += readBlock(buffer + (tempBufferBlock + i) % memSize, input1, tempBufferBlock + i); } if(load2!=0){//we have to read one more block. 
(*nios) += readBlock(buffer + (tempBufferBlock + i) % memSize, input1, tempBufferBlock + i); } //return to firstBlockID and lastBlockID their previous values firstBlockId = tempBufferBlock ; lastBlockId = firstBlockId + memSize - 1; } while(compareRecords(getRecord(buffer, bufferRecPos), bigFileBlockRec, field) < 0){ //Here we have to pass the records in the small file that are smaller than the bigFileRec incr(bufferRecPos); if (bufferRecPos.record == 0) { if(bufferRecPos.block%memSize==firstBlockId%memSize){ if (lastBlockId < infile1Size - 1) { (*nios) += readBlock(buffer + firstBlockId%memSize, input1, lastBlockId + 1); firstBlockId += 1; lastBlockId += 1; }else{ blocks=0;//No point to continue merging as all next records are greater than the last of buffer. break; } } } } if(compareRecords(getRecord(buffer, bufferRecPos), bigFileBlockRec, field) > 0){ continue;//... } tempBufferRec = getRecord(buffer, bufferRecPos); tempBufferBlock = bufferRecPos.block; countSameBufferEntries=0; while(compareRecords(getRecord(buffer, bufferRecPos), bigFileBlockRec, field)==0){ //Here we add in output the merges. 
(*outputBlock).entries[(*outputBlock).nreserved++] = bigFileBlockRec; (*outputBlock).entries[(*outputBlock).nreserved++] = getRecord(buffer, bufferRecPos); (*nres)++; if ((*outputBlock).nreserved == MAX_RECORDS_PER_BLOCK) { (*nios) += writeBlock(out, outputBlock); emptyBlock(outputBlock); (*outputBlock).blockid += 1; } countSameBufferEntries++; incr(bufferRecPos); if(bufferRecPos.record==0){ if(bufferRecPos.block%memSize==firstBlockId%memSize){ if (lastBlockId < infile1Size - 1) { (*nios) += readBlock(buffer + firstBlockId%memSize, input1, lastBlockId + 1); firstBlockId++; lastBlockId++; }else {//take always the same value because it is the last value to compare with last big file's blocks if (bufferRecPos.block == 0) { bufferRecPos.block = memSize - 1; } else { bufferRecPos.block -= 1; } bufferRecPos.record = MAX_RECORDS_PER_BLOCK - 1; break; } } } ////printf("fsdfds"); } } //records in bigFileBlock are over and we read the next one. bigFileBlockOffset++; blocks--; (*nios) += readBlock(bigFileBlock, input2, bigFileBlockOffset); } }
/*
 * One k-way merge step over `segsToMerge` sorted segments whose current
 * blocks are staged one-per-slot at the start of `buffer`; merged output is
 * staged in buffer[memSize] and written to `output` with writeBlocks.
 *
 * input/output:    file descriptors (by reference)
 * memSize:         usable buffer blocks; buffer[memSize] is the output block
 * blocksLeft:      per-segment count of blocks not yet loaded (updated here)
 * segmentSize:     nominal blocks per segment in the input file
 * firstSegOffset:  block offset of the first segment being merged
 * field:           sort/comparison attribute
 * lastPass:        when true, duplicates are eliminated while merging
 * lastMergeOfPass: when true, the final segment has its own (different) size
 * nunique:         out — incremented for each unique value written (lastPass)
 *
 * Returns the number of block I/Os performed.
 */
uint mergeElimination(int &input, int &output, block_t *buffer, uint memSize,
        uint segsToMerge, uint *blocksLeft, uint segmentSize,
        uint firstSegOffset, unsigned char field, bool lastPass,
        bool lastMergeOfPass, uint *nunique) {
    uint ios = 0;
    /* the block right after the merge slots is used to stage output */
    block_t *bufferOut = buffer + memSize;
    uint blocksWritten = 0;
    uint sizeOfLastSeg;
    if (lastMergeOfPass) {
        /* the trailing segment's true size; only read under the same
         * lastMergeOfPass condition below, so never used uninitialized */
        sizeOfLastSeg = blocksLeft[segsToMerge - 1] + 1;
    }
    // holds the last unique value written to the output
    record_t *lastRecordAdded = NULL;
    /* per-segment read cursor (block = staging slot, record = position) */
    recordPtr *nextRecord = (recordPtr*) malloc(segsToMerge * sizeof (recordPtr));
    for (uint i = 0; i < segsToMerge; i++) {
        nextRecord[i].block = i;
        nextRecord[i].record = 0;
    }
    emptyBlock(bufferOut);
    (*bufferOut).blockid = 0;
    /* number of segments not yet exhausted; loop ends when all are done */
    uint segsToMergeCopy = segsToMerge;
    while (segsToMergeCopy != 0) {
        /* find the first still-valid segment to seed the minimum search */
        uint i;
        for (i = 0; i < segsToMerge; i++) {
            if (buffer[i].valid) {
                break;
            }
        }
        record_t minRec = getRecord(buffer, nextRecord[i]);
        uint minBuffIndex = i;
        /* then scan the remaining valid segments for the overall minimum */
        for (uint j = i + 1; j < segsToMerge; j++) {
            if (buffer[j].valid && compareRecords(getRecord(buffer, nextRecord[j]),
                    minRec, field) < 0) {
                minRec = getRecord(buffer, nextRecord[j]);
                minBuffIndex = j;
            }
        }
        if (!lastPass) {
            /* intermediate pass: keep duplicates, just merge */
            (*bufferOut).entries[(*bufferOut).nreserved++] = minRec;
        } else {
            /* final pass: emit only values different from the last one written */
            if (!lastRecordAdded) {
                (*bufferOut).entries[(*bufferOut).nreserved++] = minRec;
                (*nunique) += 1;
                lastRecordAdded = (record_t*) malloc(sizeof (record_t));
                memcpy(lastRecordAdded, &minRec, sizeof (record_t));
            } else {
                if (compareRecords(*lastRecordAdded, minRec, field) != 0) {
                    (*bufferOut).entries[(*bufferOut).nreserved++] = minRec;
                    (*nunique) += 1;
                    memcpy(lastRecordAdded, &minRec, sizeof (record_t));
                }
            }
        }
        /* flush the staged output block when full */
        if ((*bufferOut).nreserved == MAX_RECORDS_PER_BLOCK) {
            ios += writeBlocks(output, bufferOut, 1);
            (*bufferOut).blockid += 1;
            blocksWritten += 1;
            emptyBlock(bufferOut);
        }
        incr(nextRecord[minBuffIndex]);
        if (nextRecord[minBuffIndex].record == 0) {
            /* cursor wrapped into the next slot: undo the slot advance (each
             * segment owns exactly one staging slot) and refill the block */
            nextRecord[minBuffIndex].block -= 1;
            if (blocksLeft[minBuffIndex] > 0) {
                uint blockOffset;
                if (lastMergeOfPass && minBuffIndex ==
                        segsToMerge - 1) {
                    /* last segment of the last merge uses its own size */
                    blockOffset = firstSegOffset + segmentSize * minBuffIndex
                            + sizeOfLastSeg - blocksLeft[minBuffIndex];
                } else {
                    blockOffset = firstSegOffset + segmentSize * minBuffIndex
                            + segmentSize - blocksLeft[minBuffIndex];
                }
                ios += preadBlocks(input, buffer + minBuffIndex, blockOffset, 1);
                blocksLeft[minBuffIndex] -= 1;
                /* a freshly read invalid block means the segment just ended */
                if (!buffer[minBuffIndex].valid) {
                    segsToMergeCopy -= 1;
                }
            } else {
                /* no blocks left: segment exhausted */
                buffer[minBuffIndex].valid = false;
                segsToMergeCopy -= 1;
            }
        } else {
            /* an invalid record marks the logical end of the segment */
            if (!getRecord(buffer, nextRecord[minBuffIndex]).valid) {
                buffer[minBuffIndex].valid = false;
                segsToMergeCopy -= 1;
            }
        }
    }
    free(nextRecord);
    if (lastRecordAdded) {
        free(lastRecordAdded);
    }
    /* flush a final partially-filled output block */
    if ((*bufferOut).nreserved != 0) {
        ios += writeBlocks(output, bufferOut, 1);
        (*bufferOut).blockid += 1;
        blocksWritten += 1;
    }
    /* pad the merged run out to its nominal size so later passes can address
     * segments at fixed offsets. NOTE(review): the padding blocks are copied
     * straight from buffer[0] — presumably only the block count matters to
     * downstream passes; verify against the callers. */
    if (!lastPass && !lastMergeOfPass) {
        for (uint i = 0; i < segmentSize * segsToMerge - blocksWritten; i++) {
            ios += writeBlocks(output, buffer, 1);
        }
    }
    return ios;
}
/*
 * infile: input filename
 * size: size in blocks of input file
 * outfile: output filename
 * field: which field will be used for sorting
 * buffer: the buffer that is used
 * memSize: number of buffer blocks available for use, without counting the
 *          last one, which is for output
 * nunique: number of unique values
 * nios: number of ios
 *
 * when the input file fits the buffer and there's still a block available for
 * output, hashes each record and writes it to the output, if a record of same
 * value is not found on the corresponding bucket.
 */
void hashElimination(char *infile, uint size, char *outfile, unsigned char field,
        block_t *buffer, uint memSize, uint *nunique, uint *nios) {
    int out = open(outfile, O_WRONLY | O_CREAT | O_TRUNC, S_IRWXU);
    /* the block after the input area stages the output */
    block_t *bufferOut = buffer + memSize;
    emptyBlock(bufferOut);
    (*bufferOut).valid = true;
    (*bufferOut).blockid = 0;
    (*nunique) = 0;
    /* the whole input file fits in the buffer */
    (*nios) += readBlocks(infile, buffer, size);
    // creates a hash index. for each value returned from the hash function,
    // there is a linkedList of pointers to the records with that specific hash
    // value
    uint hashSize = size * MAX_RECORDS_PER_BLOCK;
    linkedRecordPtr **hashIndex = (linkedRecordPtr**) malloc(hashSize * sizeof (linkedRecordPtr*));
    for (uint i = 0; i < hashSize; i++) {
        hashIndex[i] = NULL;
    }
    recordPtr start = newPtr(0);
    recordPtr end = newPtr(size * MAX_RECORDS_PER_BLOCK - 1);
    for (; start <= end; incr(start)) {
        /* skip the rest of an invalid block by jumping to its last record */
        if (!buffer[start.block].valid) {
            start.record = MAX_RECORDS_PER_BLOCK - 1;
            continue;
        }
        record_t record = getRecord(buffer, start);
        if (record.valid) {
            // hashes the record being examined
            uint index = hashRecord(infile, record, hashSize, field);
            linkedRecordPtr *element = hashIndex[index];
            // goes through the linked list for the hash value of the record
            // if a record with same value is not found, then a recordPtr is
            // added to the linked list and the record itself is written to
            // the output. otherwise, it is ignored.
            while (element) {
                if (compareRecords(record, getRecord(buffer, element->ptr), field) == 0) {
                    break;
                }
                element = element->next;
            }
            if (!element) {
                /* value not seen before: index it and emit it */
                element = (linkedRecordPtr*) malloc(sizeof (linkedRecordPtr));
                element->ptr = start;
                element->next = hashIndex[index];
                hashIndex[index] = element;
                (*bufferOut).entries[(*bufferOut).nreserved++] = record;
                (*nunique) += 1;
                if ((*bufferOut).nreserved == MAX_RECORDS_PER_BLOCK) {
                    (*nios) += writeBlocks(out, bufferOut, 1);
                    emptyBlock(bufferOut);
                    (*bufferOut).blockid += 1;
                }
            }
        }
    }
    // writes records left in buffer to the outfile
    if ((*bufferOut).nreserved != 0) {
        (*nios) += writeBlocks(out, bufferOut, 1);
    }
    /* NOTE(review): hashIndex was allocated with hashSize
     * (= size * MAX_RECORDS_PER_BLOCK) buckets, but destroyHashIndex receives
     * `size` — confirm it scales by MAX_RECORDS_PER_BLOCK internally,
     * otherwise most buckets (and their list nodes) leak. */
    destroyHashIndex(hashIndex, size);
    close(out);
}
/* * infile: filename of the input file * outfile: filename of the output file * field: which field will be used for sorting * buffer: the buffer used * nmem_blocks: size of buffer * nunique: number of unique values * nios: number of ios * * when the input file size is equal to buffer, the whole file is loaded and * sorted. then the first block is used as output where only unique values are * written */ void useFirstBlock(char *infile, char *outfile, unsigned char field, block_t *buffer, uint nmem_blocks, uint *nunique, uint *nios) { int out = open(outfile, O_WRONLY | O_CREAT | O_TRUNC, S_IRWXU); (*nios) += readBlocks(infile, buffer, nmem_blocks); if (sortBuffer(buffer, nmem_blocks, field)) { // all the unique values of the first block are shifted to the start // of it. the rest are marked as invalid recordPtr i = newPtr(1); recordPtr j = newPtr(1); (*nunique) += 1; buffer[0].nreserved = 1; for (; j.block < 1; incr(j)) { record_t record = getRecord(buffer, j); if (record.valid && compareRecords(record, getRecord(buffer, i - 1), field) != 0) { setRecord(buffer, record, i); (*nunique) += 1; incr(i); buffer[0].nreserved += 1; } } j = newPtr(i, 0); for (; j.block < 1; incr(j)) { buffer[j.block].entries[j.record].valid = false; } record_t *lastRecordAdded = (record_t*) malloc(sizeof (record_t)); record_t lastUnique = getRecord(buffer, i - 1); memcpy(lastRecordAdded, &lastUnique, sizeof (record_t)); // if the first block is full after the shifting (meaning that all its // values were actually unique), writes it to the outfile and empties it if (buffer[0].nreserved == MAX_RECORDS_PER_BLOCK) { i.block -= 1; (*nios) += writeBlocks(out, buffer, 1); emptyBlock(buffer); buffer[0].blockid += 1; } // write the unique values of the other blocks to the first one. if it // becomes full writes it to outfile and empties it. at the end, if it // has records not writtend yet, writes them to the outfile as well. 
j = newPtr(MAX_RECORDS_PER_BLOCK); while (buffer[j.block].valid && j.block < nmem_blocks) { record_t record = getRecord(buffer, j); if (!record.valid) { break; } if (compareRecords(record, (*lastRecordAdded), field) != 0) { setRecord(buffer, record, i); memcpy(lastRecordAdded, &record, sizeof (record_t)); (*nunique) += 1; incr(i); buffer[0].nreserved += 1; } if (buffer[0].nreserved == MAX_RECORDS_PER_BLOCK) { i.block -= 1; (*nios) += writeBlocks(out, buffer, 1); emptyBlock(buffer); buffer[0].blockid += 1; } incr(j); } if (buffer[0].nreserved != 0) { (*nios) += writeBlocks(out, buffer, 1); } free(lastRecordAdded); } close(out); }