Skip to content

Commit 4e9666b

Browse files
author
Nitish Gupta
committed
changes in Java project for Data-Preprocessing (yelpDataProcessing package). Mods in PythonScripts for generating Dataset folders and processing Reviews
1 parent 5d1e076 commit 4e9666b

23 files changed

+571
-85
lines changed

Diff for: JSON-JAR/javax.json-1.0.4.jar

83.2 KB
Binary file not shown.

Diff for: Project/bin/logisticCMF/codeTest.class

-179 Bytes
Binary file not shown.
2.77 KB
Binary file not shown.
978 Bytes
Binary file not shown.
-2.48 KB
Binary file not shown.

Diff for: Project/bin/yelpDataProcessing/ProcessYelpJson.class

25 Bytes
Binary file not shown.

Diff for: Project/src/logisticCMF/codeTest.java

+11-9
Original file line numberDiff line numberDiff line change
@@ -634,7 +634,7 @@ public static void performHeldOutEvaluation(String folder) throws IOException{
634634
}
635635

636636
// To test one dataset completely and write embeddings for (A + R + C + W)
637-
public static void main(String [] args) throws Exception {
637+
/*public static void main(String [] args) throws Exception {
638638
String folder = args[0];
639639
String todo = args[1];
640640
todo = "heldOut";
@@ -650,27 +650,29 @@ public static void main(String [] args) throws Exception {
650650
performRateUserColdEvaluation(folder);
651651
//attBusColdEvaluations(folder);
652652
//rateBusColdEvaluations(folder);
653-
}
653+
}*/
654+
655+
654656

655657
// To make the sizes table
656-
/*public static void main(String [] args) throws Exception {
658+
public static void main(String [] args) throws Exception {
657659

658-
String folder = "WI";
659-
data A = readAttributes(folder, 15.0, 15.0, false, 0);
660+
String folder = "EDH";
661+
data A = readAttributes(folder, 0.0, 0.0, false, 0);
660662
//data C = readCategories(folder, 5);
661-
data R = readRatings(folder, 0.0, 0.0, false, 1);
663+
//data R = readRatings(folder, 0.0, 0.0, false, 1);
662664
//data BW = readReviewData(folder, 10, true, false, 0.0, 0.0);
663665
//data UW = readReviewData(folder, 10, false, true, 0.0, 0.0);
664666

665667
A.dataStats();
666668

667669
Util.getMatrixDetails(A);
668-
Util.getMatrixDetails(C);
669-
Util.getMatrixDetails(R);
670+
//Util.getMatrixDetails(C);
671+
// //Util.getMatrixDetails(R);
670672
//Util.getMatrixDetails(BW);
671673

672674

673675

674-
}*/
676+
}
675677

676678
}
+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
package postProcessing;
2+
3+
import java.io.IOException;
4+
import java.util.*;
5+
6+
public class QualitativeEvaluation {
7+
8+
public static void main(String [] args) throws IOException {
9+
String folder = "AZ";
10+
String evaluation = "HeldOut";
11+
System.out.println("Start");
12+
13+
EntityEmbeddings attributes = new EntityEmbeddings(folder, evaluation+"/"+"attributes-bw", 30);
14+
EntityEmbeddings words = new EntityEmbeddings(folder, evaluation+"/"+"words-bw", 30);
15+
EntityEmbeddings categories = new EntityEmbeddings(folder, evaluation+"/"+"categories-bw", 30);
16+
EntityEmbeddings business = new EntityEmbeddings(folder, evaluation+"/"+"business-bw", 30);
17+
18+
Similarity s = new Similarity();
19+
s.getSimilarity(categories, words, 5);
20+
s.printSimMap();
21+
22+
String folderToRead = "CatAttWord/";
23+
String entitiesToRead = "categories.txt";
24+
String entityReadPath = System.getProperty("user.dir")+"/../Embeddings_Prediction_Data/Qualitative/"+ folderToRead + entitiesToRead;
25+
Set<String> e1sToWrite = Util.readEntitiesForTSNE(entityReadPath);
26+
Set<String> e2sToWrite = Util.getKNNEntities(e1sToWrite, s);
27+
System.out.println(e2sToWrite.size());
28+
//System.out.println(entitiesToWrite);
29+
30+
String writePath = System.getProperty("user.dir")+"/../Embeddings_Prediction_Data/Qualitative/"+ folderToRead;
31+
Util.writeEmbeddingsForSet(e1sToWrite, categories, writePath + "catEmbeddings.txt");
32+
Util.writeEmbeddingsForSet(e2sToWrite, words, writePath + "wordEmbeddings.txt");
33+
Util.writeSimilarEntities(e1sToWrite, s.simMap, writePath + "catWords.txt");
34+
35+
36+
37+
38+
39+
40+
}
41+
}
978 Bytes
Binary file not shown.
8.45 KB
Binary file not shown.

Diff for: Project/src/yelpDataProcessing/AttributeCategory.java

+11-21
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ private void buildCategorySet(String folder) throws IOException {
7373
}
7474
}
7575
//return categories;
76-
System.out.println("Businesses Read : " + count);
76+
//System.out.println("Businesses Read : " + count);
7777
}
7878

7979
private void buildThresholdCatSet(int thresh){
@@ -135,7 +135,7 @@ private void buildBusiness_AttributeDataset(String folder) throws IOException{
135135
}
136136
count++;
137137
}
138-
System.out.println("No. of Business : "+count);
138+
//System.out.println("No. of Business : "+count);
139139
br.close();
140140
bw.close();
141141
}
@@ -280,30 +280,20 @@ private void makeCitySet(String dataset) throws IOException {
280280

281281

282282
public static void main(String[] args) throws Exception{
283-
System.out.println("Hello");
284-
String State = "NV";
283+
String State = "ON";
284+
String [] folders = {"ON", "AZ", "EDH", "WI", "NV", "complete"};
285285

286286
AttributeCategory data = new AttributeCategory();
287287

288288

289289
// Read file for attributes and write to a file
290-
data.readAttributes(State);
291-
//data.printAttributes();
292-
//data.buildBusiness_AttributeDataset(State);
293-
294-
295-
data.buildCategorySet(State);
296-
data.printCategories();
297-
//data.writeResCatToFile(State);
298-
299-
int c = 0;
300-
301-
302-
//data.makeCitySet("yelp_dataset");
303-
304-
305-
306-
290+
for(String state : folders){
291+
System.out.println("Processing "+state);
292+
data.readAttributes(state);
293+
data.buildBusiness_AttributeDataset(state);
294+
data.buildCategorySet(state);
295+
data.writeResCatToFile(state);
296+
}
307297

308298
}
309299

Diff for: Project/src/yelpDataProcessing/ProcessYelpJson.java

+16-13
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ public void createStateBusinessJson(String folder, String state) throws IOExcept
8383
}
8484
}
8585
bw.close();
86-
System.out.println("No. of Businesses in "+state + " : " + cr);
86+
//System.out.println("No. of Businesses in "+state + " : " + cr);
8787
}
8888

8989

@@ -137,7 +137,7 @@ public void createBusReviewJson(String folder_complete, String folder) throws IO
137137
}
138138
}
139139
bw.close();
140-
System.out.println("Reviews Count : " + count);
140+
//System.out.println("No. of Review in "+ folder +" : " + count);
141141

142142
}
143143

@@ -156,7 +156,7 @@ public void makeBusIdSet(String folder) throws IOException{
156156
busIds.add(bid);
157157
}
158158
}
159-
System.out.println("Size of resIds set :" + busIds.size());
159+
//System.out.println("Size of resIds set :" + busIds.size());
160160
}
161161

162162
public static void putReviewDatatoFile(String folder) throws IOException{
@@ -193,26 +193,29 @@ public static void putReviewDatatoFile(String folder) throws IOException{
193193
}
194194
br.close();
195195
bw.close();
196-
System.out.println("Reviews Written : " + count);
196+
//System.out.println("Reviews Written : " + count);
197197
}
198198

199199
public static void main(String [] args) throws Exception{
200200
String yelpDataset = "yelp_dataset";
201-
String State = "NV";
201+
String State = "ON";
202+
String [] folders = {"ON", "AZ", "EDH", "WI", "NV"};
202203

203204
ProcessYelpJson yelp = new ProcessYelpJson();
204205

205206

206-
//yelp.createCompleteBusinessJson(yelpDataset);
207+
yelp.createCompleteBusinessJson(yelpDataset);
208+
yelp.createCompleteReviewJson(yelpDataset);
209+
yelp.putReviewDatatoFile("complete");
207210

208-
//yelp.createRestaurantJson("complete");
209-
yelp.createStateBusinessJson("complete", State);
210-
211-
//yelp.createCompleteReviewJson(yelpDataset);
212-
213-
yelp.createBusReviewJson("complete", State);
214211

215-
yelp.putReviewDatatoFile(State);
212+
//yelp.createRestaurantJson("complete");
213+
for(String state : folders){
214+
System.out.println("Processing "+state);
215+
yelp.createStateBusinessJson("complete", state);
216+
yelp.createBusReviewJson("complete", state);
217+
yelp.putReviewDatatoFile(state);
218+
}
216219

217220

218221

Diff for: PythonScript/clean_text.py~

+45-20
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ from nltk.tokenize import word_tokenize
1616
import re
1717
import string
1818
from nltk.stem import PorterStemmer
19+
import sys
1920

2021
stemmer = PorterStemmer()
2122

@@ -56,23 +57,47 @@ def getTokens(doc):
5657
return token_no_stop
5758

5859

59-
60-
fin = open('reviews.txt', 'r')
61-
fout = open('reviews_textProc.txt', 'w')
62-
count = 0
63-
for line in fin:
64-
# print count
65-
if(count %100000 == 0):
66-
print count
67-
if(line.strip().split(':')[0] == 'text'):
68-
tokens = getTokens(line.strip().split(':')[1])
69-
tokenSet = set()
70-
for i in tokens:
71-
if (len(i) >= 3):
72-
tokenSet.add(i)
73-
fout.write("text : ")
74-
fout.write(" ".join(tokenSet))
75-
fout.write("\n\n")
76-
else:
77-
fout.write(line)
78-
count = count + 1
60+
def processData(folder):
    """Clean the review text of one dataset folder.

    Reads ``Dataset/data/<folder>/reviews.txt`` and writes
    ``Dataset/data/<folder>/reviews_textProc.txt``.

    A line whose key (the part before the first ':') is ``text`` is replaced by
    ``text : `` followed by the space-joined set of tokens returned by
    getTokens() that are at least 3 characters long, then a blank line.
    Every other line is copied through unchanged.

    Progress is written to stdout on a single, carriage-return-updated line.

    folder -- name of the dataset sub-folder (e.g. 'AZ').
    Raises IOError if the input file cannot be opened.
    """
    filePath = "Dataset/data/" + folder
    inPath = filePath + '/reviews.txt'
    outPath = filePath + '/reviews_textProc.txt'

    # First pass: count the lines so that progress can be reported.
    # Counting with sum() also works for an empty file (0 lines).
    with open(inPath, 'r') as fin:
        lines = sum(1 for _ in fin)

    # Report roughly every 10%; max() keeps the step non-zero for files
    # shorter than 10 lines (a zero step would make the modulo below fail).
    step = max(1, lines // 10)

    count = 0
    with open(inPath, 'r') as fin:
        with open(outPath, 'w') as fout:
            for line in fin:
                # Split on the FIRST ':' only, so review text that itself
                # contains colons is kept whole instead of being truncated.
                key, _, value = line.strip().partition(':')
                if key == 'text':
                    tokenSet = set()
                    for token in getTokens(value):
                        if len(token) >= 3:
                            tokenSet.add(token)
                    fout.write("text : ")
                    fout.write(" ".join(tokenSet))
                    fout.write("\n\n")
                else:
                    fout.write(line)
                count = count + 1
                if count % step == 0:
                    # Real percentage, capped so it never shows more than 100%.
                    done = min(100, count * 100 // lines)
                    sys.stdout.write("\r" + str(done) + "% done")

    # Terminate the progress line.
    sys.stdout.write("\n")
91+
92+
# Command-line entry: python clean_text.py <folder|all>
#   all      -> process every folder listed in `folders`
#   <folder> -> process only that folder under Dataset/data/
if len(sys.argv) < 2:
    # Fail with a usage hint instead of a bare IndexError on sys.argv[1].
    sys.exit("usage: python clean_text.py <folder|all>")

option = sys.argv[1]
folders = ['EDH', 'AZ', 'ON', 'NV', 'WI']
if option == 'all':
    for folder in folders:
        processData(folder)
else:
    processData(option)
99+
100+
101+
102+
103+

Diff for: PythonScript/combineAllPredData.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# From Logisitc CMF : python PythonScript/combineAllPredData.py Embeddings_Prediction_Data HeldOut
1+
# From Repo Folder : python PythonScript/combineAllPredData.py Embeddings_Prediction_Data HeldOut
22

33
import sys
44
import os

Diff for: PythonScript/combineAllPredData.py~

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# From Logisitc CMF : python PythonScript/combineAllPredData.py Embeddings_Prediction_Data HeldOut
1+
# From Repo Folder : python PythonScript/combineAllPredData.py Embeddings_Prediction_Data HeldOut
22

33
import sys
44
import os

Diff for: PythonScript/makeDataFolders.md~

Whitespace-only changes.

Diff for: PythonScript/makeDataFolders.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# From Repo Folder : python PythonScript/makeDataFolders.py
2+
import sys
3+
import os
4+
from os import walk
5+
6+
# Root of the dataset tree, relative to the directory the script is run from.
# NOTE(review): the review-cleaning script reads from "Dataset/data/<folder>";
# confirm that "Dataset1/" is the intended root here.
dataset = "Dataset1/"

# Dataset subsets that each get their own json/ and data/ folder.
folders = ['AZ', 'NV', 'WI', 'EDH', 'ON', 'complete']


def ensureDir(path):
    """Create `path` (and any missing parents) unless it already is a directory."""
    if not os.path.isdir(path):
        os.makedirs(path)


def makeDataFolders(root, names):
    """Create the folder skeleton under `root` for every entry of `names`.

    Layout produced:
        <root>/json/<name>
        <root>/data/<name>/embeddings
        <root>/data/<name>/pred-data

    Every directory is checked individually, so the function can be re-run
    safely and also repairs a partially created tree (e.g. a data/<name>
    folder that exists without its embeddings/ or pred-data/ sub-folder).
    """
    ensureDir(os.path.join(root, "data"))
    ensureDir(os.path.join(root, "json"))
    for name in names:
        ensureDir(os.path.join(root, "json", name))
    for name in names:
        ensureDir(os.path.join(root, "data", name))
        ensureDir(os.path.join(root, "data", name, "embeddings"))
        ensureDir(os.path.join(root, "data", name, "pred-data"))


makeDataFolders(dataset, folders)

Diff for: PythonScript/makeDataFolders.py~

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# From Repo Folder : python PythonScript/makeDataFolders.py
2+
import sys
3+
import os
4+
from os import walk
5+
6+
# Root of the dataset tree, relative to the directory the script is run from.
# NOTE(review): the review-cleaning script reads from "Dataset/data/<folder>";
# confirm that "Dataset1/" is the intended root here.
dataset = "Dataset1/"

# Dataset subsets that each get their own json/ and data/ folder.
folders = ['AZ', 'NV', 'WI', 'EDH', 'ON', 'complete']


def ensureDir(path):
    """Create `path` (and any missing parents) unless it already is a directory."""
    if not os.path.isdir(path):
        os.makedirs(path)


def makeDataFolders(root, names):
    """Create the folder skeleton under `root` for every entry of `names`.

    Layout produced:
        <root>/json/<name>
        <root>/data/<name>/embeddings
        <root>/data/<name>/pred-data

    Every directory is checked individually, so the function can be re-run
    safely and also repairs a partially created tree (e.g. a data/<name>
    folder that exists without its embeddings/ or pred-data/ sub-folder).
    """
    ensureDir(os.path.join(root, "data"))
    ensureDir(os.path.join(root, "json"))
    for name in names:
        ensureDir(os.path.join(root, "json", name))
    for name in names:
        ensureDir(os.path.join(root, "data", name))
        ensureDir(os.path.join(root, "data", name, "embeddings"))
        ensureDir(os.path.join(root, "data", name, "pred-data"))


makeDataFolders(dataset, folders)

0 commit comments

Comments
 (0)