Cleaned up code

This commit is contained in:
Mysaa 2023-11-26 19:39:50 +01:00
parent 44708fbac8
commit 84e9f74c3a
Signed by: Mysaa
GPG Key ID: 7054D5D6A90F084F

View File

@ -1,9 +1,83 @@
from nltk.ccg import chart, lexicon from nltk.ccg import chart, lexicon
from nltk.ccg.chart import CCGChart,CCGLeafEdge
from nltk.tree import Tree from nltk.tree import Tree
import pandas as pd import pandas as pd
import numpy as np import numpy as np
# Multiplicative weight attached to a rule, keyed by the rule's printed form.
# NOTE(review): '>' and '<' are presumably nltk's forward and backward
# application combinators -- confirm against the rule set in use.
valz = {
    '>': 0.8,
    '<': 0.7,
}


def rweight(rule):
    """Return the weight of ``rule``.

    The rule is looked up in ``valz`` by its string form; rules that are not
    listed there get the base weight 1.0.
    """
    return valz.get(str(rule), 1.0)
# Implements the CYK algorithm, code partly taken from nltk
def weightedParse(tokens, lex, rules):
    """Fill a CCG chart bottom-up and record the best derivation of each edge.

    Every edge stored in the returned chart carries a ``weight`` attribute:
    1.0 for a leaf edge, and for a combined edge the maximum over all of its
    derivations of ``rweight(rule) * left.weight * right.weight``.  Combined
    edges also carry ``triple = (rule, left, right)``, the derivation that
    reached this maximum.

    Args:
        tokens: iterable of the words of the sentence.
        lex: lexicon; ``lex.categories(word)`` lists the categories of a word.
        rules: iterable of binary chart rules, each exposing
            ``apply(chart, lex, left, right)``.

    Returns:
        The filled ``CCGChart``.
    """
    # Named so that it does not shadow the imported ``chart`` module.
    cky_chart = CCGChart(list(tokens))
    n_leaves = cky_chart.num_leaves()

    # Initialize leaf edges.
    for index in range(n_leaves):
        word = cky_chart.leaf(index)
        for token in lex.categories(word):
            leaf_edge = CCGLeafEdge(index, token, word)
            leaf_edge.weight = 1.0
            cky_chart.insert(leaf_edge, ())

    # The rule weights do not change during parsing: compute them once.
    weighted_rules = [(rule, rweight(rule)) for rule in rules]

    # Maps an edge to the equal edge object that was seen first.  In nltk,
    # rule.apply() inserts into the chart and yields a *fresh* object even
    # when an equal edge is already stored; the chart keeps the first one, so
    # that is the object whose weight/triple must be updated.
    stored = {}

    # Select a span for the new edges; shorter spans are complete (and their
    # weights final) before any longer span reads them.
    for span in range(2, n_leaves + 1):
        for start in range(n_leaves - span + 1):
            end = start + span
            # Try all possible pairs of edges that could generate
            # an edge for that span
            for mid in range(start + 1, end):
                right_edges = list(cky_chart.select(span=(mid, end)))
                for left in cky_chart.select(span=(start, mid)):
                    for right in right_edges:
                        # Generate all possible combinations of the two edges
                        for rule, rule_weight in weighted_rules:
                            weight = rule_weight * left.weight * right.weight
                            # Every yielded edge is weighted, so no edge can
                            # enter the chart without a ``weight``.
                            for edge in rule.apply(cky_chart, lex, left, right):
                                kept = stored.setdefault(edge, edge)
                                if kept is edge or weight > kept.weight:
                                    kept.weight = weight
                                    kept.triple = (rule, left, right)
    return cky_chart
def wpToTree(edge):
    """Convert a weighted chart edge into an nltk ``Tree``.

    A leaf edge becomes a ``(token, "Leaf")`` node wrapping the token and its
    word.  Any other edge becomes a ``(Token(None, category), rule name)``
    node whose two children are the trees of the left and right edges
    recorded in ``edge.triple``.

    Args:
        edge: a ``CCGLeafEdge``, or an edge carrying a
            ``triple = (rule, left, right)`` attribute.

    Returns:
        The derivation ``Tree`` rooted at ``edge``.
    """
    if isinstance(edge, CCGLeafEdge):
        return Tree((edge.token(), "Leaf"), [Tree(edge.token(), [edge.leaf()])])

    rule, left, right = edge.triple
    return Tree(
        (chart.Token(None, edge.categ()), str(rule)),
        [wpToTree(left), wpToTree(right)],
    )
def bestTree(tokens, lex, rules):
    """Return the best-weighted derivation tree of a sentence and its weight.

    Args:
        tokens: iterable of the words of the sentence.
        lex: lexicon handed to ``weightedParse``.
        rules: chart rules handed to ``weightedParse``.

    Returns:
        A ``(tree, weight)`` pair for the highest-weight edge spanning the
        whole sentence (the first such edge in case of a tie).

    Raises:
        IndexError: if no edge spans the whole sentence.  The exception type
            is the one the previous ``[0]`` lookup raised.
    """
    # Materialise once: the length is needed and tokens may be a one-shot
    # iterable.
    words = list(tokens)

    # We build the weighted parse chart using CKY
    parsed = weightedParse(words, lex, rules)

    # We collect the edges covering the whole sentence and keep the heaviest
    full_span = list(parsed.select(start=0, end=len(words)))
    if not full_span:
        raise IndexError("no derivation spans the whole sentence")
    best = max(full_span, key=lambda candidate: candidate.weight)

    # We get the tree that brought us to this edge
    return (wpToTree(best), best.weight)
# On importe notre lexique sous forme de tableur # On importe notre lexique sous forme de tableur
table = pd.read_excel("CategoriesGramaticalesCombinatoire.ods", engine="odf") table = pd.read_excel("CategoriesGramaticalesCombinatoire.ods", engine="odf")
@ -59,103 +133,15 @@ with open('phrases.txt') as f:
if printTotal: if printTotal:
print(i,phrase) print(i,phrase)
from nltk.ccg.chart import CCGChart,CCGLeafEdge
# On affiche la dérivation la meilleure pour l'arbre
if (i==0):
valz = { print("Pas de dérivation tout court :/")
'>' : 0.8, else:
'<' : 0.7 t,d = bestTree(phrase.split(), lex, chart.ApplicationRuleSet)
} print("Found derivation tree with weight",d)
def rweight(rule): chart.printCCGDerivation(t)
s = rule.__str__()
if s in valz:
return valz[s]
else:
return 1.0 # Base rules weight
# Implements the CYK algorithm
def cyk(parser, tokens, lex, rules):
    """Fill a weighted CCG chart for ``tokens``; alias of ``weightedParse``.

    This function used to duplicate ``weightedParse`` with extra debugging
    output and a second chart that was filled but never returned.  It now
    delegates, so there is a single implementation of the chart filling.

    Args:
        parser: unused; kept so that existing calls keep working.
        tokens: iterable of the words of the sentence.
        lex: lexicon handed to ``weightedParse``.
        rules: chart rules handed to ``weightedParse``.

    Returns:
        The chart returned by ``weightedParse``.
    """
    return weightedParse(tokens, lex, rules)
def totree(edge):
    """Convert a weighted chart edge into an nltk ``Tree``.

    Alias of ``wpToTree``, whose body was an exact copy of this function;
    delegating keeps a single implementation of the conversion.
    """
    return wpToTree(edge)
def viterbiCKY(mots):
    """Set up the score table of a Viterbi CKY pass over ``mots``.

    NOTE: unfinished.  Only the table initialisation is implemented; the
    recurrence is still a placeholder and the function returns nothing.

    Args:
        mots: sequence of the words of the sentence.
    """
    n = len(mots)
    # t[s][length] is meant to hold the probability of obtaining the
    # ``length`` words mots[s], mots[s+1], ..., mots[s+length-1].
    t = np.zeros((n + 1, n + 1))
    for s in range(n):
        t[s][1] = 1.0  # a single word is obtained with probability 1

    for length in range(2, n + 1):
        for s in range(n - length + 1):
            # We want to set t[s][length]
            for k in range(1, length):  # Partitioning of the sequence
                pass  # TODO: combine t[s][k] and t[s + k][length - k]
# Reference pseudo-code of the Viterbi CKY recurrence
# (in the inner loop, v stands for the vertex ⟨X, i, j⟩ and e for the current edge):
# T ← ∅
# for 0 ≤ i ≤ n do
#     δ(⟨w_i, i, i + 1⟩) ← 1.0
# end for
# for all ⟨X, i, j⟩ ∈ V following a topological order do
#     δ(⟨X, i, j⟩) ← 0
#     for ⟨X, i, j⟩ → ⟨Y1, i, k⟩ ⟨Y2, k, j⟩ ∈ IE(v) do
#         δ(⟨X, i, j⟩) ← max(δ(v), ψ(e) × δ(⟨Y1, i, k⟩) × δ(⟨Y2, k, j⟩))
#     end for
# end for
# end function