Implemented cyk algorithm

This commit is contained in:
Mysaa 2023-11-26 19:28:37 +01:00
parent 898670ceec
commit 44708fbac8
Signed by: Mysaa
GPG Key ID: 7054D5D6A90F084F

View File

@ -1,4 +1,5 @@
from nltk.ccg import chart, lexicon
from nltk.tree import Tree
import pandas as pd
import numpy as np
@ -19,7 +20,7 @@ for i in range(n):
for j in range(3):
if isinstance(table['Cat'+str(j)][i],str):
for mot in table['MOT'][i].split('/'):
lexstring+=mot+' => ' + table['Cat'+str(j)][i] + '\n'
lexstring+=mot+' => ' + table['Cat'+str(j)][i] + '\n'
# To invert the slashes in the lexicon
#lexstring = lexstring.replace('\\','#').replace('/','\\').replace('#','/')
@ -38,7 +39,7 @@ printDerivations=not printTotal
with open('phrases.txt') as f:
lines = f.readlines()
lines.append("mon voisin lui donne le chat")
lines.append("le chat et la souris dorment")
for phrase in lines:
# Convert everything to lowercase
@ -58,3 +59,103 @@ with open('phrases.txt') as f:
if printTotal:
print(i,phrase)
from nltk.ccg.chart import CCGChart,CCGLeafEdge
# Weights for specific rules, keyed by the rule's string form.
# Any rule whose string form is not listed here falls back to the base
# weight used in rweight().
valz = {
    '>': 0.8,
    '<': 0.7,
}


def rweight(rule):
    """Return the weight associated with *rule*.

    The rule is converted to its string form and looked up in ``valz``;
    rules without an entry get the base weight 1.0.
    """
    return valz.get(str(rule), 1.0)
# Implements the CYK algorithm
def cyk(parser, tokens, lex, rules):
    """Fill a chart bottom-up (CYK style), weighting every new edge.

    Two charts are built over the same tokens: ``chart`` receives every
    edge produced by the rules, while ``chart2`` receives the leaf edges
    plus, for each (start, span) cell, only the best-weighted edge found
    for that cell.

    Args:
        parser: Not used by this function; kept for interface
            compatibility with existing callers.
        tokens: Iterable of the words of the sentence. It is materialised
            once, so a one-shot iterator is accepted.
        lex: Lexicon object; ``lex.categories(word)`` is used to create the
            leaf edges and ``lex`` is forwarded to ``rule.apply``.
        rules: Re-iterable collection of rules (it is looped over once per
            pair of edges). Each rule must provide
            ``apply(chart, lex, left, right)``.

    Returns:
        The full chart ``chart``.

    Every edge handled here is given a ``weight`` attribute (1.0 for leaf
    edges, ``rweight(rule) * left.weight * right.weight`` otherwise), and
    every combined edge a ``triple`` attribute ``(rule, left, right)``.
    """
    # NOTE(review): the indentation of this function was reconstructed from
    # a whitespace-stripped source; the nesting level of the two debug
    # prints marked below is an assumption - confirm against the original.

    # Materialise once: calling list(tokens) twice would leave the second
    # chart empty when `tokens` is a one-shot iterator.
    words = list(tokens)
    chart = CCGChart(list(words))
    chart2 = CCGChart(list(words))
    n_leaves = chart.num_leaves()

    # Initialize leaf edges.
    for index in range(n_leaves):
        word = chart.leaf(index)
        for categ in lex.categories(word):
            leaf_edge = CCGLeafEdge(index, categ, word)
            leaf_edge.weight = 1.0
            chart.insert(leaf_edge, ())
            chart2.insert(leaf_edge, ())
    # Debug output; assumed to run once, after all leaves are inserted.
    print(chart.pretty_format())

    # Select a span for the new edges
    for span in range(2, n_leaves + 1):
        for start in range(0, n_leaves - span + 1):
            end = start + span
            bestedge = None

            # Try all possible pairs of edges that could generate
            # an edge for that span
            for part in range(1, span):
                mid = start + part
                for left in chart.select(span=(start, mid)):
                    for right in chart.select(span=(mid, end)):
                        # Generate all possible combinations of the two edges
                        for rule in rules:
                            edgez = list(rule.apply(chart, lex, left, right))
                            if len(edgez) == 1:
                                edge = edgez[0]
                                edge.weight = rweight(rule) * left.weight * right.weight
                                edge.triple = (rule, left, right)
                                print(edge)
                                # Strict comparison: on ties the first edge found wins.
                                if bestedge is None or bestedge.weight < edge.weight:
                                    bestedge = edge
                            elif edgez:
                                # NOTE(review): these edges receive no weight/triple;
                                # if apply() added them to `chart`, a later
                                # `.weight` lookup on them would fail - confirm.
                                print("Too many new edges")

            if bestedge is not None:
                best_rule, best_left, best_right = bestedge.triple
                print("|", bestedge.triple,
                      rweight(best_rule) * best_left.weight * best_right.weight)
                # Re-apply the winning rule on the pruned chart. Assumes this
                # yields at least one edge; an empty result raises IndexError
                # - TODO confirm.
                e = list(best_rule.apply(chart2, lex, best_left, best_right))[0]
                e.triple = bestedge.triple
                # Debug separator; assumed to belong to this `if` block.
                print("-" * 20)

    # NOTE(review): `chart2` (best edges only) is populated but neither
    # returned nor read here - confirm which chart callers expect.
    return chart
def totree(edge):
    """Recursively convert an edge annotated by ``cyk`` into a ``Tree``.

    A leaf edge becomes a ``(token, "Leaf")`` node wrapping a node that
    holds the token and the word itself. Any other edge must carry a
    ``triple`` attribute ``(rule, child, ...)``: its node is labelled with a
    ``chart.Token`` built from the edge category plus the rule's string
    form, and its children are the converted remaining triple members.

    ``chart`` here is the module-level name (imported from ``nltk.ccg`` at
    the top of the file).
    """
    if not isinstance(edge, CCGLeafEdge):
        rule, *children = edge.triple
        label = (chart.Token(None, edge.categ()), str(rule))
        return Tree(label, [totree(child) for child in children])

    word_node = Tree(edge.token(), [edge.leaf()])
    return Tree((edge.token(), "Leaf"), [word_node])
def viterbiCKY(mots):
    """Unfinished skeleton of a Viterbi-style CKY pass over ``mots``.

    Allocates an ``(n+1, n+1)`` score table, sets the length-1 entries to
    1.0 and walks every (start, length, split) combination without doing
    any work yet. Nothing is returned (the function yields ``None``).

    Args:
        mots: Sequence of words; only its length is used so far.
    """
    n = len(mots)
    # scores[s, k] is meant to hold the probability of obtaining the
    # sub-sequence mots[s] mots[s+1] ... mots[s+k-1].
    scores = np.zeros((n + 1, n + 1))

    # Every single-word span starts with probability 1.0.
    for start in range(n):
        scores[start, 1] = 1.0

    for length in range(2, n + 1):
        for start in range(n - length + 1):
            # We want to set scores[start, length]
            for split in range(1, length):  # Partitioning of the sequence
                pass
# T ← ∅
# for 0 ≤ i ≤ n do
#     δ(⟨w_i, i, i+1⟩) ← 1.0
# end for
# for all ⟨X, i, j⟩ ∈ V following a topological order do
#     δ(⟨X, i, j⟩) ← 0
#     for ⟨X, i, j⟩ → ⟨Y1, i, k⟩ ⟨Y2, k, j⟩ ∈ IE(v) do
#         δ(⟨X, i, j⟩) ← max(δ(v), ψ(e) × δ(⟨Y1, i, k⟩) × δ(⟨Y2, k, j⟩))
#     end for
# end for
# end function