frflccg/ccg-test.py
2023-11-26 19:39:50 +01:00

148 lines
4.9 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from nltk.ccg import chart, lexicon
from nltk.ccg.chart import CCGChart,CCGLeafEdge
from nltk.tree import Tree
import pandas as pd
import numpy as np
# Multiplicative weight per rule, keyed by the rule's string form.
# NOTE(review): '>' and '<' are presumably NLTK's printed names for forward
# and backward application -- confirm against the rule set in use.
valz = {
    '>': 0.8,
    '<': 0.7,
}


def rweight(rule):
    """Return the weight associated with *rule*.

    The rule is looked up in ``valz`` by its string form; any rule that has
    no entry there gets the base weight 1.0.
    """
    return valz.get(str(rule), 1.0)  # Base rules weight
# Implements the CYK algorithm, code partly taken from nltk
def weightedParse(tokens, lex, rules):
    """Fill a CCG chart bottom-up and annotate every edge with its best weight.

    Each leaf edge gets ``weight = 1.0``.  Each edge built by a rule gets

    * ``weight``: ``rweight(rule) * left.weight * right.weight`` for the
      highest-weighted way of deriving that edge, and
    * ``triple``: the ``(rule, left, right)`` combination achieving it.

    Parameters:
        tokens: the words of the sentence (any iterable; it is copied).
        lex: the lexicon; ``lex.categories(word)`` must return the lexical
            tokens of a word.
        rules: iterable of chart rules exposing
            ``apply(chart, lex, left, right)``.  Only rules producing at most
            one edge per call are supported; otherwise a warning is printed
            and the produced edges are left without annotations.

    Returns:
        The filled ``CCGChart``.
    """
    parse_chart = CCGChart(list(tokens))
    num_leaves = parse_chart.num_leaves()

    # Initialize leaf edges.
    for index in range(num_leaves):
        leaf = parse_chart.leaf(index)
        for token in lex.categories(leaf):
            new_edge = CCGLeafEdge(index, token, leaf)
            new_edge.weight = 1.0
            parse_chart.insert(new_edge, ())

    # NLTK charts keep a single instance per equal edge: when a rule derives
    # an edge that is already present, the chart only records the new child
    # pointers and the freshly built object is discarded.  We therefore
    # remember the instance actually stored in the chart, so that weights are
    # written to (and compared against) the edge later returned by select().
    stored = {}

    # Select a span for the new edges.  Spans are processed by increasing
    # width, so the sub-edges read below already carry their final weight.
    for span in range(2, num_leaves + 1):
        for start in range(num_leaves - span + 1):
            end = start + span
            # Try all possible pairs of edges that could generate
            # an edge for that span
            for mid in range(start + 1, end):
                for left in parse_chart.select(span=(start, mid)):
                    for right in parse_chart.select(span=(mid, end)):
                        # Generate all possible combinations of the two edges
                        for rule in rules:
                            produced = list(
                                rule.apply(parse_chart, lex, left, right)
                            )
                            if not produced:
                                continue
                            if len(produced) > 1:
                                print("Too many new edges (unsupported rule used)")
                                continue
                            edge = produced[0]
                            weight = rweight(rule) * left.weight * right.weight
                            kept = stored.setdefault(edge, edge)
                            # Annotate a brand new edge, or upgrade an existing
                            # one when this derivation is strictly better.
                            if kept is edge or weight > kept.weight:
                                kept.weight = weight
                                kept.triple = (rule, left, right)
    return parse_chart
def wpToTree(edge):
    """Rebuild the derivation tree recorded on a weighted chart edge.

    A leaf edge becomes a node labelled ``(token, "Leaf")`` whose single
    child is a node labelled with the token and holding the word itself.
    Any other edge must carry a ``triple`` attribute ``(rule, left, right)``;
    it becomes a node labelled ``(Token(None, category), str(rule))`` whose
    children are the trees of the remaining elements of the triple.
    """
    if not isinstance(edge, CCGLeafEdge):
        rule, *children = edge.triple
        label = (chart.Token(None, edge.categ()), str(rule))
        return Tree(label, [wpToTree(child) for child in children])

    token = edge.token()
    word_node = Tree(token, [edge.leaf()])
    return Tree((token, "Leaf"), [word_node])
def bestTree(tokens, lex, rules):
    """Return the highest-weighted derivation of *tokens* and its weight.

    Parameters:
        tokens: sequence of words (its length defines the full span).
        lex: the CCG lexicon used to look up the words.
        rules: the chart rules handed to ``weightedParse``.

    Returns:
        A ``(tree, weight)`` pair for the best edge covering the sentence.

    Raises:
        IndexError: if no edge covers the whole sentence.
    """
    # We build the weighted parse chart using CKY
    w = weightedParse(tokens, lex, rules)

    # Candidates are the edges spanning the whole sentence
    spanning = list(w.select(start=0, end=len(tokens)))
    if not spanning:
        raise IndexError("no edge spans the whole sentence")

    # Prefer real sentence derivations (the lexicon's start category); fall
    # back to any spanning edge so the previous behaviour is kept otherwise.
    target = lex.start()
    candidates = [e for e in spanning if e.categ() == target] or spanning

    # max() keeps the first of several equally weighted edges
    best = max(candidates, key=lambda e: e.weight)

    # We get the tree that brought us to this edge
    return (wpToTree(best), best.weight)
# Load the lexicon, which is maintained as a spreadsheet
table = pd.read_excel("CategoriesGramaticalesCombinatoire.ods", engine="odf")
# Number of words defined in the spreadsheet
n = len(table['MOT'])
# Lexicon lines: first the primitive categories, then the shorthand V for S\N
lexicon_lines = [':- S,N,Pp', 'V :: S\\N']
# Turn the spreadsheet rows into lexicon entries: one entry per category
# column holding a string, for every '/'-separated spelling of the word
category_columns = ['Cat' + str(j) for j in range(3)]
for row in range(n):
    for column in category_columns:
        category = table[column][row]
        if not isinstance(category, str):
            continue
        lexicon_lines.extend(
            mot + ' => ' + category for mot in table['MOT'][row].split('/')
        )
lexstring = '\n'.join(lexicon_lines) + '\n'
# To swap the slash directions in the lexicon
#lexstring = lexstring.replace('\\','#').replace('/','\\').replace('#','/')
# Build the lexicon
lex = lexicon.fromstring(lexstring)
# Build the parser, giving it the set of rules it is supposed to know
parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
#parser = chart.CCGChartParser(lex, chart.ApplicationRuleSet)
# Output switches: print either the number of derivations per sentence
# (printTotal) or every derivation found (printDerivations)
printTotal = True
printDerivations = not printTotal

# Read the sentences from the file; the handle is released before parsing
with open('phrases.txt') as f:
    lines = f.readlines()
lines.append("le chat et la souris dorment")

# The lexicon and the parser do not depend on the sentence: build them once
lex = lexicon.fromstring(lexstring)
parser = chart.CCGChartParser(lex, chart.ApplicationRuleSet)

for phrase in lines:
    # Lower-case everything and drop the surrounding whitespace
    phrase = phrase.lower().strip()
    words = phrase.split()

    if printDerivations:
        print("============================================================================")
        print('#', phrase)

    # Count (and optionally display) every derivation tree found
    i = 0
    for parse in parser.parse(words):
        i += 1
        if printDerivations:
            chart.printCCGDerivation(parse)

    if printTotal:
        print(i, phrase)

    # Display the best derivation for the sentence
    if i == 0:
        print("Pas de dérivation tout court :/")
    else:
        t, d = bestTree(words, lex, chart.ApplicationRuleSet)
        print("Found derivation tree with weight", d)
        chart.printCCGDerivation(t)