-
Notifications
You must be signed in to change notification settings - Fork 2
/
build_index.py
125 lines (114 loc) · 3.96 KB
/
build_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import csv
import numpy as np
import pandas as pd
from BTrees.OOBTree import OOBTree
import pickle
import io
import sys
# Raise the interpreter's recursion limit to 1,000,000 for the whole process.
# NOTE(review): presumably needed so that pickling deeply nested index
# structures (see the 'BTree' branch of build_index) does not hit
# RecursionError — confirm before changing or removing.
sys.setrecursionlimit(1000000)
def build_index(file_path, filename, idx_path, attr_name, idx_type, multiway):
    """
    Build an on-disk index for one CSV file and save it under ``idx_path``.

    The output file name is derived from ``filename`` by replacing ``'.csv'``:

    * ``'Hash'``: ``<name><attr_name>.npy`` stores a dict mapping every value
      of the attribute (as the raw string read from the CSV) to the list of
      0-based data-row numbers carrying that value. If ``multiway`` is true,
      ``<name><attr_name>idx.npy`` additionally stores the reverse dict
      (row number -> value).
    * ``'BTree'``: ``<name><attr_name>.pkl`` stores a pickled ``OOBTree``
      mapping every value (as parsed by pandas) to the list of row labels
      carrying that value.
    * ``'Location'``: ``<name>loc.npy`` stores the offsets returned by
      ``getloc_b_c`` (for ``business.csv`` / ``checkin.csv``) or by
      ``getloc_r_p`` (for every other file).
    * ``'Tag'``: ``<name>tag.npy`` stores the header row of the CSV.

    Any other ``idx_type`` is ignored and nothing is written.

    Dicts are handed to ``np.save`` as-is, so they are stored as pickled
    object arrays; read them back with
    ``np.load(path, allow_pickle=True).item()``.

    :param file_path: directory of the CSV, including the trailing separator,
        e.g. 'C:/2017_Fall/CS 411/csv_data/'
    :param filename: e.g. 'review.csv'
    :param idx_path: output directory, including the trailing separator,
        e.g. file_path + 'index/'
    :param attr_name: column to index, e.g. 'stars' (used by 'Hash'/'BTree')
    :param idx_type: e.g. 'Hash', 'BTree', 'Location', 'Tag'
    :param multiway: True or False, True for join attribute, e.g. 'business_id'
    :return: None
    :raises ValueError: ('Hash') if ``attr_name`` is not in the header row
    :raises KeyError: ('BTree') if ``attr_name`` is not a column of the CSV
    """
    if idx_type == 'Hash':
        idx_name = filename.replace('.csv', attr_name + '.npy')
        id_name = filename.replace('.csv', attr_name + 'idx.npy')
        idx_dict = {}
        id_dict = {}
        with open(file_path + filename, 'r', encoding='utf8') as myfile:
            reader = csv.reader(myfile)
            attr_idx = next(reader).index(attr_name)
            # Row numbers are 0-based and do not count the header row.
            for count, row in enumerate(reader):
                cur = row[attr_idx]
                # Plain dict + setdefault (not defaultdict) so the saved
                # object stays an ordinary dict.
                idx_dict.setdefault(cur, []).append(count)
                if multiway:
                    id_dict[count] = cur
        if multiway:
            np.save(idx_path + id_name, id_dict)
        np.save(idx_path + idx_name, idx_dict)
    elif idx_type == 'BTree':
        idx_name = filename.replace('.csv', attr_name + '.pkl')
        idx_list = {}
        # Group row labels by value chunk by chunk; read_csv keeps the row
        # labels running across chunks, so no intermediate row -> value dict
        # of the whole file has to be rebuilt for every chunk.
        for chunk_df in pd.read_csv(file_path + filename, chunksize=10000, encoding="utf8"):
            for row_label, value in chunk_df[attr_name].to_dict().items():
                idx_list.setdefault(value, []).append(row_label)
        BT = OOBTree()
        BT.update(idx_list)
        with open(idx_path + idx_name, 'wb') as f:
            pickle.dump(BT, f, pickle.HIGHEST_PROTOCOL)
    elif idx_type == 'Location':
        # Only business.csv and checkin.csv use the plain line scanner; every
        # other file goes through the CSV-aware one.
        if filename in ('business.csv', 'checkin.csv'):
            new_od = getloc_b_c(file_path + filename)
        else:
            new_od = getloc_r_p(file_path + filename)
        idx_name = filename.replace('.csv', 'loc.npy')
        np.save(idx_path + idx_name, new_od)
    elif idx_type == 'Tag':
        # The context manager closes the file even if the CSV is empty and
        # next() raises.
        with open(file_path + filename, 'r', encoding='utf8') as myfile:
            tag = next(csv.reader(myfile))
        idx_name = filename.replace('.csv', 'tag.npy')
        np.save(idx_path + idx_name, tag)
def getloc_b_c(fname):
    """
    Return the start offset of every physical line in ``fname``.

    The file is read in text mode (UTF-8, default newline handling) and the
    offsets are the values reported by ``tell()`` immediately before each
    line is read. The end-of-file position is not included, so an empty file
    yields an empty list.

    :param fname: path of the file to scan, e.g. 'business.csv'
    :return: list of offsets, one per line, in file order
    """
    offsets = []
    # The context manager guarantees the handle is closed even on errors.
    with open(fname, "r", encoding='utf8') as f:
        # readline() is used instead of `for line in f` because iterating a
        # text file disables tell().
        position = f.tell()
        while f.readline():
            offsets.append(position)
            position = f.tell()
    return offsets
def getloc_r_p(fname):
    """
    Return the start offset of every CSV record in ``fname``.

    Unlike a plain line scan, this copes with quoted fields that contain
    line breaks. The work is done in two passes:

    1. Collect the ``tell()`` offset of every physical line, where a line
       ends only at ``"\\r\\n"``.
    2. Walk the file once with a sequential ``csv.reader``. For every
       candidate offset, parse one record starting at that offset; the offset
       is kept only when that record equals the record the sequential reader
       is currently positioned on. Candidates that fall inside a multi-line
       field parse to something else and are dropped.

    An empty file yields an empty list. Lines left over after the sequential
    reader is exhausted (the tail of a multi-line final record) are treated
    as continuations and dropped.

    :param fname: path of the CSV file, e.g. "review-5k.csv"
    :return: list of offsets, one per CSV record (header included), in order
    """
    # Pass 1: offsets of all physical lines, splitting on "\r\n" only.
    candidates = []
    with io.open(fname, encoding="utf8", newline="\r\n") as raw:
        position = raw.tell()
        while raw.readline():
            candidates.append(position)
            position = raw.tell()
    if not candidates:
        return []

    # Pass 2: keep only the candidates that begin a real record. One probe
    # handle is reused with seek() instead of reopening the file per line.
    record_offsets = []
    with open(fname, "r", encoding='utf8') as seq_file, \
            open(fname, "r", encoding='utf8') as probe_file:
        records = csv.reader(seq_file)
        expected = next(records, None)
        for offset in candidates:
            probe_file.seek(offset)
            # A fresh reader per probe so no parser state carries over.
            parsed = next(csv.reader(probe_file), None)
            if expected is not None and parsed == expected:
                record_offsets.append(offset)
                # None once the records run out; any remaining candidates
                # are then continuation lines of the last record.
                expected = next(records, None)
    return record_offsets