-
Notifications
You must be signed in to change notification settings - Fork 0
/
zipf.cpp
152 lines (128 loc) · 3.78 KB
/
zipf.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/**
* Programming Assignment #3: Zipf's Law
* CSC300 - Data Structures - Fall 2015
* Dr. John Weiss
*
* Authors: Lucas Carpenter, Chrissy Sorensen
*
* Compile: g++ zipf.cpp hashTable.cpp -std=c++11 -o zipf
*
* Usage: ./zipf <filename>.txt
*
**/
/**
* Includes
**/
#include <algorithm>
#include <cctype>
#include <ctime>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include "hashTable.h"
using namespace std;
/**
* Global Variables
*
* valid  - the 26 lowercase ASCII letters, a through z
* valid2 - the same 26 letters followed by the apostrophe character
**/
const char* valid = "abcdefghijklmnopqrstuvwxyz";
const char* valid2 = "abcdefghijklmnopqrstuvwxyz'";
/**
* Prototypes
*
* Forward declarations for the functions that are defined after main
* in this file.
**/
void fileHandle ( char* fileName, hashTable &h );
string tokenize ( string str );
/**
* Functions
**/
/**
* main(int, char*[]), program entry point. Expects exactly one command line
* argument, the name of the text file to process.
*
* Returns 0 after the file has been handed to fileHandle, or 1 after
* printing a usage statement when the argument count is wrong.
**/
int main ( int argc, char* argv[] )
{
    // the table is created up front, before the arguments are inspected
    hashTable h;

    // anything other than "program name + one file name" is a usage error
    if ( argc != 2 )
    {
        std::cout << "Usage: zipf filename\n";
        return 1;
    }

    // exactly one argument: treat it as the input file and process it
    fileHandle ( argv[1], h );
    return 0;
}
/**
* fileHandle(char*, hashTable&), reads the named file one whitespace-separated
* chunk at a time, cleans each chunk with tokenize, and inserts every
* non-empty result into the hash table h. Afterwards it prints summary
* counts, sorts the table, prints the time taken, and asks the table to
* print its statistics.
*
* fileName - path of the text file to read
* h        - hash table that receives the words
*
* Returns early (after printing a message) if the file cannot be opened or
* if the table reports a failed insert.
**/
void fileHandle ( char* fileName, hashTable &h )
{
    std::ifstream in ( fileName );
    if ( !in )
    {
        std::cout << "Unable to open " << fileName << ". Exiting program.\n";
        return;
    }

    // timing covers reading, inserting, the summary output and the sort
    const std::clock_t startTicks = std::clock();

    std::string word;
    while ( in >> word )
    {
        word = tokenize ( word );

        // chunks with no letters (numbers, punctuation) tokenize to "" and are skipped
        if ( word.empty() )
        {
            continue;
        }

        if ( !h.insert ( word ) )
        {
            std::cout << "Unable to insert " << word << " into hashtable. Exiting program\n";
            return;
        }
    }

    std::cout << "Finished generating a hash table of size " << h.getSize() << ".\n";
    std::cout << "Read " << h.getNumWords() << " words from the file " << fileName << ".\n";
    std::cout << "Inserted " << h.getNumDistinct() << " distinct words into the hash table.\n";

    // No newline follows this message, so flush explicitly; otherwise the
    // progress text may stay buffered until after the sort has finished.
    std::cout << "Compacting and sorting the hash table ... " << std::flush;
    h.sort();
    std::cout << "finished!\n";

    const std::clock_t elapsedTicks = std::clock() - startTicks;

    // clock ticks -> milliseconds; std::setprecision is declared in <iomanip>
    std::cout << std::fixed << std::showpoint;
    std::cout << "Elapsed time = " << std::setprecision ( 1 )
              << elapsedTicks / ( CLOCKS_PER_SEC / 1000.0 ) << " msec.\n";

    h.printStats ( fileName );
}
/**
* tokenize(string), reduces one chunk of input text to a single lowercase
* word. The chunk is lowercased, everything before the first letter a-z is
* skipped, and the word then runs until the first character that is neither
* a letter nor an apostrophe. Any apostrophes left at the end of the word are
* removed, so "dogs'" and "dogs''" both become "dogs" while "don't" is kept
* intact.
*
* str - the raw chunk (taken by value; the copy is modified)
*
* Returns the cleaned word, or "" when the chunk contains no letter a-z.
* This function is a modification of Dr. Weiss's tokenize2 code.
**/
std::string tokenize ( std::string str )
{
    // characters that may start a word, and characters that may continue one
    static const char* const letters = "abcdefghijklmnopqrstuvwxyz";
    static const char* const wordChars = "abcdefghijklmnopqrstuvwxyz'";

    // Lowercase in place. tolower is only defined for values representable as
    // unsigned char (or EOF), so bytes are converted before the call instead
    // of being passed as possibly negative plain chars.
    std::transform ( str.begin(), str.end(), str.begin(),
                     [] ( unsigned char c )
    {
        return static_cast<char> ( std::tolower ( c ) );
    } );

    // skip leading delimiters; no letter at all means there is no word
    const std::string::size_type tokenStart = str.find_first_of ( letters );
    if ( tokenStart == std::string::npos )
    {
        return "";
    }

    // the word ends at the next delimiter, or at the end of the string
    std::string::size_type tokenEnd = str.find_first_not_of ( wordChars, tokenStart );
    if ( tokenEnd == std::string::npos )
    {
        tokenEnd = str.size();
    }

    // Drop every trailing apostrophe, not just one. str[tokenStart] is a
    // letter, so this loop always stops before tokenEnd reaches tokenStart.
    while ( str[tokenEnd - 1] == '\'' )
    {
        --tokenEnd;
    }

    return str.substr ( tokenStart, tokenEnd - tokenStart );
}