# NOTE: recovered from a whitespace-collapsed git diff of ccg-test.py
# (index 501d54a..4d7b660).  Besides adding the definitions below, the patch
# made these edits to top-level script code that is only partly visible here
# and is therefore not reproduced:
#   * added `from nltk.tree import Tree` to the import block;
#   * whitespace-only change on the `lexstring+=mot+' => ' + ...` line;
#   * the extra sentence appended after reading phrases.txt changed from
#     "mon voisin lui donne le chat" to "le chat et la souris dorment";
#   * whitespace-only changes around the `if printTotal:` block.

import numpy as np


# Weight applied to an edge according to the rule that produced it, keyed by
# the rule's string form.
valz = {
    '>': 0.8,
    '<': 0.7,
}


def rweight(rule):
    """Return the weight associated with *rule*.

    The rule is looked up in ``valz`` by its ``str()`` form; rules that are
    not listed there get the base weight ``1.0``.
    """
    return valz.get(str(rule), 1.0)


def cyk(parser, tokens, lex, rules):
    """Fill a CCG chart bottom-up (CYK) while tracking weighted best edges.

    Parameters:
        parser: unused; kept so existing call sites keep working.
        tokens: the words of the sentence to parse.
        lex: lexicon object providing ``categories(word)``.
        rules: iterable of rules exposing ``apply(chart, lex, left, right)``.

    Every edge created here gets a ``weight`` attribute (leaves: 1.0,
    otherwise rule weight * left weight * right weight) and, for non-leaf
    edges, a ``triple`` attribute ``(rule, left, right)``.  For each span the
    highest-weighted new edge is additionally replayed into a second chart.

    Returns:
        The chart holding all edges.
        NOTE(review): the chart restricted to best edges is built but never
        returned -- confirm which of the two callers are meant to receive.
    """
    # Imported here so the block does not depend on the file's import header.
    from nltk.ccg.chart import CCGChart, CCGLeafEdge

    full_chart = CCGChart(list(tokens))
    best_chart = CCGChart(list(tokens))
    num_leaves = full_chart.num_leaves()

    # Initialize leaf edges.
    for index in range(num_leaves):
        leaf = full_chart.leaf(index)
        for token in lex.categories(leaf):
            leaf_edge = CCGLeafEdge(index, token, leaf)
            leaf_edge.weight = 1.0
            full_chart.insert(leaf_edge, ())
            best_chart.insert(leaf_edge, ())
    # NOTE(review): original nesting of this debug print was lost; it is
    # emitted once, after all leaves are inserted.
    print(full_chart.pretty_format())

    # Select a span for the new edges.
    for span in range(2, num_leaves + 1):
        for start in range(num_leaves - span + 1):
            end = start + span
            best_edge = None

            # Try every split point, i.e. all pairs of adjacent edges that
            # could combine into an edge covering (start, end).
            for mid in range(start + 1, end):
                for left in full_chart.select(span=(start, mid)):
                    for right in full_chart.select(span=(mid, end)):
                        for rule in rules:
                            new_edges = list(
                                rule.apply(full_chart, lex, left, right)
                            )
                            if not new_edges:
                                continue
                            if len(new_edges) > 1:
                                print("Too many new edges")
                                continue

                            edge = new_edges[0]
                            edge.weight = (
                                rweight(rule) * left.weight * right.weight
                            )
                            edge.triple = (rule, left, right)
                            print(edge)
                            # Strict comparison: on ties the first edge wins.
                            if (best_edge is None
                                    or best_edge.weight < edge.weight):
                                best_edge = edge

            if best_edge is not None:
                rule, left, right = best_edge.triple
                print("|", best_edge.triple,
                      rweight(rule) * left.weight * right.weight)
                # Replay only the winning combination into the second chart.
                replayed = list(rule.apply(best_chart, lex, left, right))[0]
                replayed.triple = best_edge.triple

    # NOTE(review): original nesting of this separator was lost; it is
    # printed once, just before returning.
    print("-" * 20)
    return full_chart


def totree(edge):
    """Convert an edge produced by ``cyk`` into an ``nltk.tree.Tree``.

    Leaf edges become a ``(token, "Leaf")`` node wrapping a ``token`` node
    whose only child is the word.  Other edges become a
    ``(Token(None, category), rule string)`` node whose children are the
    trees of the left and right edges stored in ``edge.triple``; such edges
    must therefore carry the ``triple`` attribute set by ``cyk``.
    """
    # Imported here so the block does not depend on the file's import header.
    from nltk.ccg import chart as ccg_chart
    from nltk.ccg.chart import CCGLeafEdge
    from nltk.tree import Tree

    if isinstance(edge, CCGLeafEdge):
        return Tree(
            (edge.token(), "Leaf"),
            [Tree(edge.token(), [edge.leaf()])],
        )

    rule, *children = edge.triple
    return Tree(
        (ccg_chart.Token(None, edge.categ()), str(rule)),
        [totree(child) for child in children],
    )


def viterbiCKY(mots):
    """Unfinished scaffold for a Viterbi CKY over the words *mots*.

    Allocates a table ``t`` where ``t[s][l]`` is meant to hold the best
    probability of the span ``mots[s] ... mots[s+l-1]`` (start ``s``,
    length ``l``), sets every length-1 span to 1.0 and walks the longer spans
    in increasing length.  The recurrence itself is not implemented yet, so
    the function currently returns ``None``.
    """
    n = len(mots)
    t = np.zeros((n + 1, n + 1))
    for s in range(n):
        t[s][1] = 1.0
    for l in range(2, n + 1):
        for s in range(n - l + 1):
            # We want to set t[s][l].
            for k in range(1, l):  # Partitioning of the sequence
                # TODO: combine t[s][k] and t[s+k][l-k] with the rule weight.
                pass


# Reference pseudo-code for the recurrence:
#
# T ← ∅
# for 0 ≤ i ≤ n do
#     δ(〈wi , i, i + 1〉) ← 1.0
# end for
# for all 〈X , i, j〉 ∈ V following a topological order do
#     δ(〈X , i, j〉) ← 0
#     for 〈X , i, j〉 → 〈Y1, i, k〉 〈Y2, k, j〉 ∈ IE (v ) do
#         δ(〈X , i, j〉) ← max (δ(v ), ψ(e) × δ(〈Y1, i, k〉) × δ(〈Y2, k, j〉))
#     end for
# end for
# end function