from BeautifulSoup import BeautifulStoneSoup 
import os 
import glob

# Module-level cache mapping document number -> document text.
# get_corpus() adds an entry for every document it loads and get_doc()
# reads from it.
global_doc_dict = dict()

def get_doc(doc_num):
    """Return the text stored for ``doc_num`` in ``global_doc_dict``.

    Raises:
        KeyError: if ``doc_num`` has no entry in ``global_doc_dict``.
    """
    doc_text = global_doc_dict[doc_num]
    return doc_text
  
def get_corpus(qNum=0):
    """Load documents from the ``corpus`` directory into a dictionary.

    Reads ``corpus/top_docs.<qNum>``, or every file matching
    ``corpus/top_docs.*`` when ``qNum`` is 0 (the default).  Each file is
    parsed with BeautifulStoneSoup, and every ``<doc>`` element for which a
    ``<text>`` element is found contributes one entry.

    Side effect: every loaded document is also stored in the module-level
    ``global_doc_dict`` (the mapping that ``get_doc`` reads from).

    Args:
        qNum: Query number selecting a single file, or 0 for all files.

    Returns:
        A dict mapping document number (the stripped ``<docno>`` content)
        to the document text with each newline replaced by a space.  The
        dict is empty when no file matches.
    """
    path = 'corpus'
    # Named ``corpus`` rather than ``dict`` so the builtin is not shadowed.
    corpus = {}

    if qNum == 0:
        fname = 'top_docs.*'
    else:
        fname = 'top_docs.' + str(qNum)

    for infile in glob.glob(os.path.join(path, fname)):
        # Read the markup inside ``with`` so the handle is always closed,
        # even if reading raises; previously one handle leaked per file.
        with open(infile) as f:
            markup = f.read()
        soup = BeautifulStoneSoup(markup)

        for d in soup('doc'):
            # NOTE(review): findNext searches forward from ``d`` through the
            # rest of the document, not only inside this <doc>; this presumes
            # every <doc> carries its own <docno>/<text> -- confirm against
            # the corpus files.
            docNum = d.findNext('docno').renderContents().strip()
            dT = d.findNext('text')

            # No <text> element found: nothing to store for this document.
            if dT is None:
                continue

            # Flatten the text onto one line: each newline becomes a space.
            docText = dT.renderContents().strip().replace("\n", " ")
            corpus[docNum] = docText
            global_doc_dict[docNum] = docText

    return corpus

# When executed as a script, load the corpus file for query number 204
# (the returned dictionary is discarded).
if __name__ == '__main__':
    get_corpus(204)