from BeautifulSoup import BeautifulStoneSoup
import os
import glob
# Module-level cache mapping document number -> document text.
# Filled as a side effect of get_corpus() and read by get_doc().
global_doc_dict = {}
def get_doc(doc_num):
    """Return the cached text of the document numbered ``doc_num``.

    The lookup goes to the module-level ``global_doc_dict``, which is filled
    by ``get_corpus()``. A ``KeyError`` is raised when ``doc_num`` is not in
    that cache.
    """
    doc_text = global_doc_dict[doc_num]
    return doc_text
def get_corpus(qNum=0, path='corpus'):
    """Load document texts from the ``top_docs.*`` files under ``path``.

    Each matching file is parsed with ``BeautifulStoneSoup``. For every
    ``<doc>`` element the contents of the next ``<docno>`` and ``<text>``
    tags are read and stripped, and newlines in the text are replaced by
    single spaces.

    Args:
        qNum: Query number selecting the single file ``top_docs.<qNum>``.
            The default ``0`` loads every file matching ``top_docs.*``.
        path: Directory that holds the ``top_docs`` files. Defaults to
            ``'corpus'``, resolved against the current working directory.

    Returns:
        A dict mapping document number to document text for the documents
        read by this call. Documents for which no ``<docno>`` or ``<text>``
        tag is found are skipped. An empty dict is returned when no file
        matches.

    Side effects:
        Every loaded document is also stored in the module-level
        ``global_doc_dict`` so that ``get_doc()`` can return it later.
    """
    if qNum == 0:
        fname = 'top_docs.*'
    else:
        fname = 'top_docs.' + str(qNum)

    corpus = {}
    for infile in glob.glob(os.path.join(path, fname)):
        # ``with`` guarantees the handle is closed even if parsing raises;
        # previously each opened file was left for the garbage collector.
        with open(infile) as f:
            soup = BeautifulStoneSoup(f)
            for d in soup('doc'):
                # NOTE(review): findNext() searches forward through the rest
                # of the parsed document, not only inside this <doc>, so a
                # <doc> missing a tag may pick up the next document's tag --
                # confirm the input files are well-formed.
                doc_no_tag = d.findNext('docno')
                text_tag = d.findNext('text')
                if doc_no_tag is None or text_tag is None:
                    # Without both an id and a body there is nothing to store.
                    continue
                doc_num = doc_no_tag.renderContents().strip()
                # Flatten the text onto one line.
                doc_text = text_tag.renderContents().strip().replace("\n", " ")
                corpus[doc_num] = doc_text
                global_doc_dict[doc_num] = doc_text
    return corpus
if __name__ == "__main__":
    # Script entry point: load the documents for query 204, which also
    # fills the module-level document cache as a side effect.
    get_corpus(204)