frflccg/ccg-test.py
2023-11-26 19:28:37 +01:00

162 lines
5.5 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from nltk.ccg import chart, lexicon
from nltk.tree import Tree
import pandas as pd
import numpy as np
# Load the hand-written lexicon from its spreadsheet.
table = pd.read_excel("CategoriesGramaticalesCombinatoire.ods", engine="odf")
# Number of rows (words) defined in the spreadsheet.
n = len(table['MOT'])

# The lexicon source starts with the primitive categories, followed by the
# shorthand V for S\N.
lexicon_lines = [':- S,N,Pp', 'V :: S\\N']
# Turn the spreadsheet into parsable "word => category" lines: one line per
# spelling (spellings of a row are separated by '/') and per category column.
for i in range(n):
    for j in range(3):
        category = table[f'Cat{j}'][i]
        if not isinstance(category, str):
            # Cells that do not hold a string carry no category.
            continue
        for mot in table['MOT'][i].split('/'):
            lexicon_lines.append(f'{mot} => {category}')
lexstring = ''.join(f'{line}\n' for line in lexicon_lines)
# To flip the direction of every slash in the lexicon:
#lexstring = lexstring.replace('\\','#').replace('/','\\').replace('#','/')
# Build the lexicon.
lex = lexicon.fromstring(lexstring)
# Build the parser, giving it the set of rules it is allowed to use.
parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
#parser = chart.CCGChartParser(lex, chart.ApplicationRuleSet)
# Output mode: either one "<number of parses> <sentence>" line per sentence,
# or every derivation printed in full.
printTotal=True
printDerivations=not printTotal
# Read the sentences from the file, one per line.  The encoding is given
# explicitly so accented characters do not depend on the platform default.
with open('phrases.txt', encoding='utf-8') as f:
    lines = f.readlines()
lines.append("le chat et la souris dorment")

# Neither the lexicon nor the parser depends on the sentence being parsed, so
# they are built once here rather than once per sentence.
lex = lexicon.fromstring(lexstring)
parser = chart.CCGChartParser(lex, chart.ApplicationRuleSet)

for phrase in lines:
    # Lower-case everything and drop the surrounding whitespace / newline.
    phrase = phrase.lower().strip()
    if printDerivations:
        print("============================================================================")
        print('#',phrase)
    # Walk through every derivation found, counting them as we go.
    parse_count = 0
    for parse in parser.parse(phrase.split()):
        parse_count += 1
        if printDerivations:
            chart.printCCGDerivation(parse)
    if printTotal:
        print(parse_count,phrase)
from nltk.ccg.chart import CCGChart,CCGLeafEdge
# Weight of each rule, keyed by the rule's string form.
# NOTE(review): '>' and '<' are presumably NLTK's forward and backward
# application rules - confirm.
valz = {'>': 0.8, '<': 0.7}


def rweight(rule):
    """Return the weight associated with ``rule``.

    The rule is looked up in ``valz`` by its string form; a rule without an
    entry gets the base weight 1.0.
    """
    return valz.get(str(rule), 1.0)
# Implements the CYK algorithm
def cyk(parser, tokens, lex, rules):
    """Run a weighted CYK pass over ``tokens`` and return the filled chart.

    Each entry returned by ``lex.categories(word)`` is inserted as a leaf
    edge of weight 1.0.  Wider spans are then built bottom-up: every rule is
    applied to every pair of adjacent edges, and a resulting edge receives
    ``rweight(rule) * left.weight * right.weight`` as its ``weight`` and
    ``(rule, left, right)`` as its ``triple``.  For each span, the heaviest
    new edge is also re-derived into a second chart.  A trace of the work is
    printed along the way.

    Args:
        parser: Unused; kept so existing calls keep working.
        tokens: The words of the sentence (any iterable).
        lex: Lexicon object providing ``categories(word)``.
        rules: Collection of rules providing ``apply(chart, lex, left, right)``;
            it is iterated once per pair of edges, so it must be re-iterable.

    Returns:
        The chart that received every generated edge.
    """
    # Materialise the tokens once: an iterator argument would otherwise be
    # exhausted by the first chart and leave the second one empty.
    words = list(tokens)
    full_chart = CCGChart(list(words))
    # NOTE(review): this second chart only receives the leaves and the best
    # edge of each span, but it is never returned or read - confirm whether
    # callers were meant to get it.
    best_chart = CCGChart(list(words))

    # Initialize leaf edges.
    for index in range(full_chart.num_leaves()):
        word = full_chart.leaf(index)
        for token in lex.categories(word):
            new_edge = CCGLeafEdge(index, token, word)
            new_edge.weight = 1.0
            full_chart.insert(new_edge, ())
            best_chart.insert(new_edge, ())
    print(full_chart.pretty_format())

    num_leaves = full_chart.num_leaves()
    # Select a span for the new edges
    for span in range(2, num_leaves + 1):
        for start in range(0, num_leaves - span + 1):
            rend = start + span
            bestedge = None
            # Try all possible pairs of edges that could generate
            # an edge for that span
            for part in range(1, span):
                mid = start + part
                for left in full_chart.select(span=(start, mid)):
                    for right in full_chart.select(span=(mid, rend)):
                        # Generate all possible combinations of the two edges
                        for rule in rules:
                            edgez = list(rule.apply(full_chart, lex, left, right))
                            if len(edgez) == 1:
                                edge = edgez[0]
                                edge.weight = rweight(rule) * left.weight * right.weight
                                edge.triple = (rule, left, right)
                                print(edge)
                                # Strict comparison: on a tie the first edge found wins.
                                if bestedge is None or bestedge.weight < edge.weight:
                                    bestedge = edge
                            elif edgez:
                                print("Too many new edges")
            if bestedge is not None:
                best_rule, best_left, best_right = bestedge.triple
                print("|",bestedge.triple,rweight(best_rule) * best_left.weight * best_right.weight)
                # Re-derive the winning edge inside the best-only chart.
                e = list(best_rule.apply(best_chart, lex, best_left, best_right))[0]
                e.triple = bestedge.triple
            print("-"*20)
    return full_chart
def totree(edge):
    """Build an nltk ``Tree`` from ``edge`` using the ``triple`` back-pointers.

    A leaf edge becomes a ``(token, "Leaf")`` node wrapping a node labelled
    with the token whose only child is the word.  Any other edge becomes a
    node labelled with ``(Token(None, category), rule string)`` whose
    children are the trees of the remaining elements of ``edge.triple``.
    """
    if isinstance(edge, CCGLeafEdge):
        leaf_label = (edge.token(), "Leaf")
        word_node = Tree(edge.token(), [edge.leaf()])
        return Tree(leaf_label, [word_node])

    rule = edge.triple[0]
    children = edge.triple[1:]
    label = (chart.Token(None, edge.categ()), str(rule))
    return Tree(label, [totree(child) for child in children])
def viterbiCKY(mots):
    """Skeleton of a Viterbi CKY pass over the words ``mots`` (unfinished).

    Only the table initialisation is implemented; the span loops are empty
    placeholders and the function returns ``None``.
    """
    n = len(mots)
    # NOTE(review): t[s][l] is presumably the probability of the l-word
    # sequence mots[s] ... mots[s+l-1] - confirm once the recursion is written.
    t = np.zeros((n + 1, n + 1))
    # Every single-word span starts with probability 1.
    for row in t[:n]:
        row[1] = 1.0
    for length in range(2, n + 1):
        for s in range(n - length + 1):
            # t[s][length] is the cell that should be filled in here.
            for k in range(1, length):  # Partitioning of the sequence
                pass
# Pseudocode of the Viterbi CKY recursion:
#   T ← ∅
#   for 0 ≤ i ≤ n do
#       δ(⟨w_i, i, i+1⟩) ← 1.0
#   end for
#   for all v = ⟨X, i, j⟩ ∈ V following a topological order do
#       δ(⟨X, i, j⟩) ← 0
#       for e = (⟨X, i, j⟩ → ⟨Y1, i, k⟩ ⟨Y2, k, j⟩) ∈ IE(v) do
#           δ(⟨X, i, j⟩) ← max(δ(v), ψ(e) × δ(⟨Y1, i, k⟩) × δ(⟨Y2, k, j⟩))
#       end for
#   end for
# end function