/
threadedSearch.py
162 lines (128 loc) · 6.08 KB
/
threadedSearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#! /usr/bin/env python
# encoding: utf-8
# Multithreading Searcher and Indexer.
# One Thread Indexes new documents in the background
# while thread in the foreground waits for new user queries
# This searcher and indexer works from the terminal, simply start it.
# It begins indexing files in the directory you point it to.
# import needed system modules
import os
import threading
# Import necessary Py-Lucene modules
import lucene
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.util import Version
from org.apache.lucene.index import IndexWriter
from org.apache.lucene.index import IndexWriterConfig, DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.document import Document, Field, TextField
from org.apache.lucene.search.highlight import Highlighter, QueryScorer
from org.apache.lucene.search.highlight import SimpleFragmenter
from org.apache.lucene.search.highlight import NullFragmenter
from org.apache.lucene.search.highlight import SimpleHTMLFormatter
from java.io import File
class Indexer(threading.Thread):
# set some initial values for the class, the root directory to
# start indexing and pass in a writer instance
def __init__(self, root, writer, directoryToWalk):
threading.Thread.__init__(self)
self.root = root
self.writer = writer
self.directory = directoryToWalk
def run(self):
env.attachCurrentThread()
self.indexDocs()
# start indexing beginning at the root directory
def indexDocs(self):
for self.root, dirnames, filenames in os.walk(self.directory):
for filename in filenames:
try:
path = os.path.join(self.root, filename)
file = open(path)
contents = unicode(file.read(), 'iso-8859-1')
file.close()
doc = Document()
doc.add(Field("name", filename, TextField.TYPE_STORED))
doc.add(Field("path", path, TextField.TYPE_STORED))
if len(contents) > 0:
doc.add(Field("contents", contents,
TextField.TYPE_STORED))
else:
print "warning: the file is empty %s" % filename
self.writer.addDocument(doc)
self.writer.commit()
except Exception, e:
print "Failed in indexDocs:", e
class Queryer():
def __init__(self, store_dir, hits_dir, frags_dir=None):
# store_dir is the location of our generated lucene index
# hits_dir is the location of the highlighted document hits
# frags_dif is the location of the document hit fragments - optional
self.store_dir = store_dir
self.hits_dir = hits_dir
self.frags_dir = frags_dir
if not os.path.exists(self.store_dir):
os.mkdir(self.store_dir)
if not os.path.exists(self.hits_dir):
os.mkdir(self.hits_dir)
if self.frags_dir is not None and not os.path.exists(self.frags_dir):
os.mkdir(self.frags_dir)
self.directory = SimpleFSDirectory(File(self.store_dir))
# For now I just use the StandardAnalyzer
self.analyzer = StandardAnalyzer(Version.LUCENE_43)
config = IndexWriterConfig(Version.LUCENE_43, self.analyzer)
self.writer = IndexWriter(self.directory, config)
def run(self, writer=None, analyzer=None):
if writer is None:
writer = self.writer
if analyzer is None:
analyzer = self.analyzer
searcher = IndexSearcher(DirectoryReader.open(\
SimpleFSDirectory.open(File(self.store_dir))))
while True:
print
print "Hit enter with no input to quit."
command = raw_input("Query:")
if command == '':
return
print "Searching for:", command
query = QueryParser(Version.LUCENE_43, "contents",
analyzer).parse(command)
# We'll just show the top 10 matching documents for now
scoreDocs = searcher.search(query, 10).scoreDocs
print "%s total matching documents." % len(scoreDocs)
# Highlight the matching text in red
highlighter = Highlighter(SimpleHTMLFormatter('<b><font color\
="red">', '</font></b>'), QueryScorer(query))
# Using NullFragmenter since we still want to see
# the whole document
highlighter.setTextFragmenter(NullFragmenter())
for scoreDoc in scoreDocs:
doc = searcher.doc(scoreDoc.doc)
tokenStream = analyzer.tokenStream("contents",
StringReader(doc.get("contents")))
# arg 3: the maximum number of fragments
# arg 4: the separator used to intersperse the
# document fragments (typically "...")
# arg 3 and 4 don't really matter with NullFragmenter
result = highlighter.getBestFragments(tokenStream,
doc.get("contents"), 2, "...")
if len(result) > 10:
file_handler = open(self.hits_dir + '/' + doc.get("name"),
'w+')
file_handler.write(result)
# create hit fragments, if we want to show them
# arg 1: fragment size
highlighter.setTextFragmenter(SimpleFragmenter(200))
for scoreDoc in scoreDocs:
doc = searcher.doc(scoreDoc.doc)
tokenStream = analyzer.tokenStream("contents",
StringReader(doc.get("contents")))
result = highlighter.getBestFragments(tokenStream,
doc.get("contents"), 2, "...")
if len(result) > 10:
file_handler = open(self.frags_dir + '/' + doc.get("name"),
'w+')
file_handler.write(result)