import nltk.chunk
import nltk
from nltk.corpus import conll2000
import itertools
from nltk import pos_tag, word_tokenize
"""
pos_tag(word_tokenize(trkStr))
train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(snt)]
for snt in sent]
"""
class UnigramChunker(nltk.ChunkParserI):
    """Chunker that predicts a chunk tag from each token's POS tag alone.

    A ``nltk.UnigramTagger`` is trained on (POS tag, chunk tag) pairs taken
    from chunked training sentences and is then used to label POS-tagged
    input.
    """

    def __init__(self, train_sents):
        """Train the underlying unigram tagger.

        train_sents -- iterable of chunked sentences, each accepted by
                       ``nltk.chunk.tree2conlltags``.
        """
        # The word itself is discarded: the tagger learns POS tag -> chunk tag.
        train_data = [
            [(pos, chunk)
             for _word, pos, chunk in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse2(self, tokens):
        """Chunk a POS-tagged token sequence and return the resulting tree.

        tokens -- sequence of (word, pos_tag) pairs.

        Tokens whose predicted chunk tag is falsy (e.g. ``None`` when the
        tagger has no prediction for that POS tag) are dropped, so the
        returned tree may contain fewer tokens than the input.  Empty input
        produces the tree parsed from an empty CoNLL string.
        """
        lines = []
        if len(tokens) > 0:
            words, tags = zip(*tokens)
            chunks = self.tagger.tag(tags)
            # Builtin zip works on both Python 2 and 3; itertools.izip was
            # removed in Python 3.
            lines = [' '.join([w, t, c])
                     for (w, (t, c)) in zip(words, chunks) if c]
        return nltk.chunk.conllstr2tree('\n'.join(lines))
def clean_punctuation(text, marks=('.', ',', '!', ':', '"')):
    """Put a space in front of every punctuation mark found in *text*.

    This lets a later whitespace split separate the marks from the words
    they are attached to.

    text  -- the string to process.
    marks -- iterable of substrings to separate; each occurrence is replaced
             by a space followed by the mark.  Marks are processed in the
             given order.  The default covers ``. , ! :`` and the double
             quote.

    Returns the modified string; text containing none of the marks is
    returned unchanged.
    """
    for mark in marks:
        text = text.replace(mark, " " + mark)
    return text
def run(q_id):
    """Split every document retrieved for a query into phrase-level answers.

    A ``UnigramChunker`` is trained on the CoNLL-2000 ``train.txt`` chunked
    sentences.  Each document returned by ``init.get_corpus(q_id)`` is then
    punctuation-separated, whitespace-tokenized, POS-tagged and chunked.
    Consecutive tokens that share the same phrase label are merged into one
    answer.

    q_id -- query identifier, passed to ``init.get_corpus`` and copied into
            every answer tuple.

    Returns a list of tuples
    ``(phrase_text, doc_key, start_index, phrase_tag, q_id)`` where
    ``start_index`` is the position of the phrase's first token in the
    flattened chunker output for that document (tokens dropped by the
    chunker are not counted).
    """
    train_sents = conll2000.chunked_sents('train.txt')
    unigram_chunker = UnigramChunker(train_sents)
    import init
    # get_corpus is indexed by the values of .keys() below, i.e. it is used
    # as a mapping of document key -> document text.
    topdoc = init.get_corpus(q_id)
    answers = []
    for docnum in topdoc.keys():
        doc_tokens = clean_punctuation(topdoc[docnum]).split()
        tagged = pos_tag(doc_tokens)
        chunked = unigram_chunker.parse2(tagged)
        # Each entry of pos() has the form ((word, pos_tag), phrase_label).
        flatten = chunked.pos()
        # Keep an explicit running offset so that the last phrase uses the
        # same start-index convention as every other phrase, and so that a
        # document with no tokens simply contributes no answers.
        start = 0
        for phrase_tag, group in itertools.groupby(
                flatten, key=lambda entry: entry[1]):
            words = [word for ((word, _pos), _label) in group]
            answers.append(
                (' '.join(words), docnum, start, phrase_tag, q_id))
            start += len(words)
    return answers
if __name__ == "__main__":
    # Parenthesized single-argument print is valid on Python 2 and Python 3.
    print(run(213))

# NOTE: the following remark was fused onto the print line as raw Turkish
# text; it is kept here, translated to English:
#   You can make additions like this to machine learning algorithms.  Before
#   attempting something like this, I recommend studying the Bayes and
#   hidden Markov algorithms.  These two are, for example, among the basic
#   algorithms most widely used in search engines.