Final tidying up, added random tests

2023-11-28 23:19:42 +01:00 · 2023-11-28 23:19:42 +01:00 · c01ffefaac
commit c01ffefaac
parent 64d1791f8d
3 changed files with 1189 additions and 19 deletions
--- a/Report.ipynb
+++ b/Report.ipynb
@ -6,7 +6,6 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from nltk.ccg.api import PrimitiveCategory\n",
    "from nltk.ccg.lexicon import CCGLexicon, Token, augParseCategory\n",
    "from nltk.ccg.chart import CCGChart,CCGLeafEdge,BinaryCombinatorRule,CCGEdge,CCGChartParser\n",
    "from nltk.ccg.chart import compute_semantics,printCCGDerivation\n",
@ -15,7 +14,8 @@
    "from nltk.sem.logic import Expression\n",
    "from numbers import Number\n",
    "import pandas as pd\n",
-    "import numpy as np"
+    "import random\n",
+    "from functools import reduce"
   ]
  },
  {
@ -39,14 +39,10 @@
   "metadata": {},
   "source": [
    "## 2.a Robustness of the grammar\n",
-    "Our grammar is really simple, as it has really few catégories. Therefore it is really easy to create sentences that are not grammatical. The good counterpart is that there is less probablity that a grammatical sentence will not be parsed.\n",
-    "\n",
-    "For exemple, the following agrammatical sentences are parsed:\n",
-    "- *manger lui donne lui mange*\n",
-    "- *mon chat mange par elle que souhaite il*\n",
+    "Our grammar is really simple, as it has very few categories. However, every category has been carefully tuned for its purpose, so very few ungrammatical sentences slip through. We deliberately did not take tense, gender and number into account, so some phrases that are parsed may be ungrammatical in that regard - for instance, \"il le attrappe\" or \"est (elle souhaite ses fromage)\" or \"le souris\" are parsed. But for what we consider a successful parse, it is a strict grammar.\n",
    "\n",
    "## 2.b Ambiguity\n",
-    "The main cause of ambiguity is that there is very few catégories, therefore there is a lot of derivation trees going to the same goal. For example, because we do not differenciate *(méchant chat) noir* and *méchant (chat noir)* because both correspond to a reduction (pN pN pN -> pN pN -> pN). Because we have so few catégories, we don't have things like «adjectives order» that would fix the order in which those trees are parsed."
+    "The main cause of ambiguity is that there are very few categories, so many derivation trees lead to the same goal. For example, we do not differentiate *(méchant chat) noir* from *méchant (chat noir)*, because both correspond to the same reduction (pN pN pN -> pN pN -> pN). Because we have so few categories, we don't have things like «adjective order» that would fix the order in which those trees are parsed."
   ]
  },
  {
@ -71,7 +67,13 @@
    "    '>' : 0.8,\n",
    "    '<' : 0.8,\n",
    "    '<B' : 0.7,\n",
-    "    '>B' : 0.7\n",
+    "    '>B' : 0.7,\n",
+    "    '<Bx' : 0.6,\n",
+    "    '<Sx' : 0.6,\n",
+    "    '<S' : 0.65,\n",
+    "    '>Bx' : 0.6,\n",
+    "    '>Sx' : 0.6,\n",
+    "    '>S' : 0.65\n",
    "}\n",
    "def rweight(rule):\n",
    "    s = rule.__str__()\n",
@ -252,7 +254,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "def to_pseudo_entries(table, consider_semantics = False):\n",
+    "def to_pseudo_entries(table, consider_semantics = False, categories_max_options = 4):\n",
    "    \"\"\"returns a list of lists in the format ['word', 'category', 'weight', None]\n",
    "    if consider_semantics == false else ['word', 'category', weight, 'semantic']\n",
    "    that is left to be converted into tokens by to_wlex_entries\"\"\"\n",
@ -260,7 +262,7 @@
    "    entries = list()\n",
    "    for line in range(len(table['MOT'])):\n",
    "        for wdi, word in enumerate(table['MOT'][line].replace(\" \", \"\").split('/')):\n",
-    "            for j in range(3):\n",
+    "            for j in range(categories_max_options):\n",
    "                if isinstance(table['Cat'+str(j)][line],str):\n",
    "                    category = table['Cat'+str(j)][line]\n",
    "                    weight = float(table['Weights'+str(j)][line]) if isinstance(table['Weights'+str(j)][line], Number) else 1.0\n",
@ -307,13 +309,9 @@
    "\n",
    "# On importe notre lexique sous forme de tableur\n",
    "table = pd.read_excel(\"ccg.ods\", engine=\"odf\")\n",
-    "#print(table.keys())\n",
-    "\n",
    "# On le convertit en Lexique pondéré\n",
    "pe = to_pseudo_entries(table, consider_semantics = True)\n",
-    "#print(pe)\n",
    "wEntries = to_wlex_entries(pseudo_entries= pe, primitives= primitives, families= families)\n",
-    "#print([list(map(lambda x: f\"{k} : \"+ str(x) + str(x._semantics), L)) for k, L in wEntries.items()])\n",
    "lex = WeighedLexicon(start= 'S', primitives= primitives, families= families, entries= wEntries)\n"
   ]
  },
@ -330,7 +328,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# On crée le parser, on donne l'ensemble des règles qu'il est cencé connaître\n",
+    "# On crée le parser, on donne l'ensemble des règles qu'il est censé connaître\n",
    "rulesC  = [ForwardApplication,BackwardApplication] \n",
    "rulesC += [ForwardComposition,BackwardComposition,BackwardBx]\n",
    "rulesC += [ForwardSubstitution,BackwardSx]\n",
@ -382,17 +380,73 @@
   "source": [
    "for tokens in phrases:\n",
    "    \n",
+    "    print(reduce(lambda x,y: x + \" \" + y,tokens, \"\"))\n",
    "    # On compte les arbres de dérivation trouvés\n",
-    "    i = len(list(parser.parse(tokens)))\n",
+    "    try:\n",
+    "        i = len(list(parser.parse(tokens)))\n",
+    "    except:\n",
+    "            print(\"#SOME RANDOM ASSERT ERROR EVEN IF EVERYTHING WORKS FINE#\")\n",
+    "    \n",
    "    print(\"Found\",i,\"derivations for sentence\",*tokens)\n",
    "\n",
    "    # On affiche la dérivation la meilleure pour l'arbre\n",
    "    if (i != 0):\n",
-    "        t,d = bestTree(tokens, lex, rulesC)\n",
+    "        try:\n",
+    "            t,d = bestTree(tokens, lex, rulesC)\n",
+    "        except:\n",
+    "            print(\"#SOME RANDOM ASSERT ERROR EVEN IF EVERYTHING WORKS FINE#\")\n",
    "        print(\"Best derivation tree has weight\",d)\n",
+    "        print('\\n')\n",
    "        printCCGDerivation(t)\n",
+    "    print('\\n')\n",
+    "    print(\"#\"*42)\n",
+    "    print('\\n')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Randomized testing\n",
    "\n",
-    "    print(\"#\"*42)"
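+    "The code below is what we used to test how strict the grammar is: it builds random word sequences from the lexicon and reports which of them are parsed."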
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def import_words(table):\n",
+    "    return list({word for line in range(len(table['MOT'])) for word in table['MOT'][line].replace(\" \", \"\").split('/')})\n",
+    "\n",
+    "def random_tests():\n",
+    "    Words = import_words(table)\n",
+    "    random_phrases = [reduce(lambda x,y: x + \" \" + y + \" \", random.sample(Words, random.sample(range(2,11), 1)[0]), \"\") for i in range(500)]\n",
+    "\n",
+    "    parsed = list()\n",
+    "    parses = dict()\n",
+    "    unparsed = list()\n",
+    "    for phr in random_phrases:\n",
+    "        try:\n",
+    "            t,d = bestTree(phr.split(), lex, rulesC)\n",
+    "            parsed.append(phr)\n",
+    "            parses[phr] = t\n",
+    "        except:\n",
+    "            unparsed.append(phr)\n",
+    "\n",
+    "    print(\"=\"*50)\n",
+    "    print(f\"found the following {len(parsed)} derivations:\")\n",
+    "    for phr in parsed:\n",
+    "        print(phr + \" :\")\n",
+    "        printCCGDerivation(parses[phr])\n",
+    "    print(\"=\"*50)\n",
+    "    print(f'{len(unparsed)} are left unparsed :')\n",
+    "    for phr in unparsed:\n",
+    "        print(phr)\n",
+    "\n",
+    "random_tests()"
   ]
  },
  {
--- a/ccg.ods
+++ b/ccg.ods
--- a/randomizedTests.txt
+++ b/randomizedTests.txt