-
Notifications
You must be signed in to change notification settings - Fork 1
/
IndexBuilder.py
88 lines (71 loc) · 2.49 KB
/
IndexBuilder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import json
import math
import re
import os
from collections import OrderedDict
# Directory prefix under which each project's folder is looked up.
repoDir = "repoData/"
# Name of the per-project word-frequency file, appended to the project folder path.
wordFreqFileName = "/wordFrequencies.pfq"
# Directory prefix for the per-character index dump files written at the end of the script.
indexDir = "indexData/"
# In-memory inverted index: term -> [docFreq, [[docId, termFreq, weightage-or-score], ...]].
indexDict = {}
def getFileLines(fileName):
    """Return every line of the text file ``fileName`` as a list of strings.

    Line terminators are kept, exactly as ``readlines`` returns them.  The
    file is opened in a ``with`` block so the handle is released even when
    reading raises.
    """
    with open(fileName, "r") as tempFile:
        return tempFile.readlines()
def mergeIndicesToDict(pfqFileName, docId):
    """Merge one document's word-frequency file into the global ``indexDict``.

    Every line of ``pfqFileName`` is split on tabs into
    ``term, termFreq, weightage`` (the weightage reflects whether the term
    occurs in the description).  The term is lower-cased and kept only when
    it starts with ``a``-``z`` and consists solely of letters, digits and
    ``_ @ . -``.  For each kept term the posting
    ``[docId, termFreq, weightage]`` is appended to ``indexDict[term][1]``;
    a new term starts with a document-frequency slot of 0.

    Blank lines and lines with fewer than three fields are skipped.
    A non-numeric frequency or weightage still raises ``ValueError``.
    """
    termPattern = re.compile(r"^[A-Za-z0-9_@.-]+$")
    for line in getFileLines(pfqFileName):
        lineSplits = line.strip().split("\t")
        # str.split never returns an empty list, so require all three fields
        # explicitly; this also filters out blank lines (e.g. a trailing
        # newline), which would otherwise fail on term[0].
        if len(lineSplits) < 3:
            continue
        term = lineSplits[0].lower()
        if not ('a' <= term[:1] <= 'z') or not termPattern.match(term):
            continue
        termFreq = int(lineSplits[1])
        # lineSplits[2] is the weightage based on whether the term is in the description.
        posting = [docId, termFreq, int(lineSplits[2])]
        indexDict.setdefault(term, [0, []])[1].append(posting)
def updateTfIdf():
    """Fill in document frequencies and tf-idf scores for all of ``indexDict``.

    For every term, slot 0 of its entry is set to the number of postings
    (the document frequency, df).  The third field of each posting, which
    holds the raw text-type weightage on entry, is replaced in place by

        (1 + log10(tf)) * log10(N / df) * (weightage + 1)

    rounded to one decimal place.  N is the number of distinct documents
    that appear in any posting.

    Raises ``ValueError`` (math domain error) if a posting has a term
    frequency of 0 or less.
    """
    # The idf term needs the collection size, i.e. the number of documents,
    # not the number of distinct terms, so count the distinct doc ids.
    docIds = set()
    for entry in indexDict.values():
        for posting in entry[1]:
            docIds.add(posting[0])
    N = len(docIds)

    for entry in indexDict.values():
        postings = entry[1]
        docFreq = len(postings)
        entry[0] = docFreq
        if not postings:
            continue
        # idf is the same for every posting of the term, so compute it once.
        idf = math.log(N / docFreq, 10)
        for posting in postings:
            # Modified tf-idf: scaled linearly by the text-type weightage.
            # (An earlier variant damped it as 1 + log2(weightage + 1).)
            posting[2] = round((1 + math.log(posting[1], 10)) * idf * (posting[2] + 1), 1)
# Load the project metadata; the with-block closes the file even if parsing fails.
with open("projects_meta_data.json") as allProjFile:
    allProjDict = json.load(allProjFile)

# Merge each project's word-frequency file into the index.  The folder name
# is the project's "full_name" with "/" replaced by "-", and it doubles as
# the document id.
for count, project in enumerate(allProjDict):
    folderName = project["full_name"].replace("/", "-")
    pfqPath = repoDir + folderName + wordFreqFileName
    if os.path.isfile(pfqPath):
        mergeIndicesToDict(pfqPath, folderName)
    else:
        print("No PFQ : " + str(count) + " : " + folderName)
    # Progress marker every 1000 projects.
    if count % 1000 == 0:
        print(count)

# All postings are in place; compute document frequencies and scores.
updateTfIdf()
# Split the index into one sub-dictionary per leading character of the term.
charChunkedDict = {}
for term, termEntry in indexDict.items():
    # setdefault creates the bucket on first sight AND stores the term in it;
    # creating the bucket without inserting would drop the first term seen
    # for every leading character.
    charChunkedDict.setdefault(term[0], {})[term] = termEntry
def sortByDocFreq(termItem):
    """Return element 0 of the value stored under the first key of ``termItem``.

    With an index-shaped mapping (term -> [docFreq, postings]) that element
    is the document frequency of the first term.
    """
    firstKey = list(termItem)[0]
    firstEntry = termItem[firstKey]
    return firstEntry[0]
print(len(charChunkedDict))
for alpha in charChunkedDict:
    # Order the bucket's terms by descending document frequency (slot 0 of each entry).
    charChunkedDict[alpha] = OrderedDict(sorted(charChunkedDict[alpha].items(), key=lambda t: t[1][0], reverse=True))
    # Buckets for a-z get their own file; any other leading character shares 'special'.
    indexFileSuffix = alpha if 'a' <= alpha <= 'z' else 'special'
    print(alpha)
    # The with-block flushes and closes the dump file deterministically.
    with open(indexDir + 'indexDump-' + indexFileSuffix + '.json', 'w') as indexFile:
        json.dump(charChunkedDict[alpha], indexFile)
# Alternative: dump the whole index into a single file.
#json.dump(indexDict, open('indexDump.json','w'))
print(len(indexDict))