From ea75172611277b7399715afe41ade1d523df08f7 Mon Sep 17 00:00:00 2001
From: Johannes Maucher <johannes@Johanness-iMac.fritz.box>
Date: Thu, 29 Sep 2022 16:29:01 +0200
Subject: [PATCH] mc

---
 11ModellingWordsAndTexts.ipynb | 838 +++++++++++++++++++--------------
 1 file changed, 472 insertions(+), 366 deletions(-)

diff --git a/11ModellingWordsAndTexts.ipynb b/11ModellingWordsAndTexts.ipynb
index 0ead3ce..e8e1d6d 100644
--- a/11ModellingWordsAndTexts.ipynb
+++ b/11ModellingWordsAndTexts.ipynb
@@ -60,11 +60,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd"
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "np.set_printoptions(suppress=True,precision=3)"
    ]
   },
   {
@@ -76,104 +78,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
     }
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Word Index:\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>0</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>all</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>and</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>at</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>boys</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>girls</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>home</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>kids</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>not</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>stay</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       0\n",
-       "0    all\n",
-       "1    and\n",
-       "2     at\n",
-       "3   boys\n",
-       "4  girls\n",
-       "5   home\n",
-       "6   kids\n",
-       "7    not\n",
-       "8   stay"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "simpleWordDF=pd.DataFrame(data=[\"all\", \"and\", \"at\", \"boys\", \"girls\", \"home\", \"kids\", \"not\", \"stay\"])\n",
     "print(\"\\nWord Index:\")\n",
@@ -182,184 +93,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
     }
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Corresponding One-Hot-Encoding\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>_all</th>\n",
-       "      <th>_and</th>\n",
-       "      <th>_at</th>\n",
-       "      <th>_boys</th>\n",
-       "      <th>_girls</th>\n",
-       "      <th>_home</th>\n",
-       "      <th>_kids</th>\n",
-       "      <th>_not</th>\n",
-       "      <th>_stay</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   _all  _and  _at  _boys  _girls  _home  _kids  _not  _stay\n",
-       "0     1     0    0      0       0      0      0     0      0\n",
-       "1     0     1    0      0       0      0      0     0      0\n",
-       "2     0     0    1      0       0      0      0     0      0\n",
-       "3     0     0    0      1       0      0      0     0      0\n",
-       "4     0     0    0      0       1      0      0     0      0\n",
-       "5     0     0    0      0       0      1      0     0      0\n",
-       "6     0     0    0      0       0      0      1     0      0\n",
-       "7     0     0    0      0       0      0      0     1      0\n",
-       "8     0     0    0      0       0      0      0     0      1"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\"\\nCorresponding One-Hot-Encoding\")\n",
     "pd.get_dummies(simpleWordDF,prefix=\"\")"
@@ -421,7 +161,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
@@ -435,32 +175,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
     }
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/johannes/opt/anaconda3/envs/books/lib/python3.8/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n",
-      "  warnings.warn(msg, category=FutureWarning)\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "['all', 'and', 'at', 'boys', 'girls', 'home', 'kids', 'not', 'stay']"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "corpus = ['not all kids stay at home.',\n",
     "          'all boys and girls stay not at home.',\n",
@@ -471,25 +192,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
     }
    },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[1, 0, 1, 0, 0, 1, 1, 1, 1],\n",
-       "       [1, 1, 1, 1, 1, 1, 0, 1, 1]])"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "BoW.toarray()"
    ]
@@ -517,7 +226,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
@@ -531,7 +240,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
@@ -544,27 +253,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
     }
    },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[0.37863221, 0.        , 0.37863221, 0.        , 0.        ,\n",
-       "        0.37863221, 0.53215436, 0.37863221, 0.37863221],\n",
-       "       [0.30253071, 0.42519636, 0.30253071, 0.42519636, 0.42519636,\n",
-       "        0.30253071, 0.        , 0.30253071, 0.30253071]])"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "tfidf_BoW.toarray()"
    ]
@@ -577,7 +272,13 @@
     }
    },
    "source": [
-    "As can be seen in the example above, words which appear in all documents are weighted by 0, i.e. they are considered to be not relevant."
+    "By inspecting the output of the previous cell, you may realize that it does not match the expectation. We expect the tf-idf value of words which appear in all documents to be 0. However, in the example above such words have non-zero tf-idf values. The reason for this is that **in scikit-learn** tf-idf is implemented in a slightly different way. As described in the documentation of the [scikit-learn TfidfTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html), scikit-learn (with its default setting `smooth_idf=True`) calculates the tf-idf as follows:\n",
+    "\n",
+    "$$\n",
+    "tfidf_{i,j}=tf_{i,j} \\cdot \\left( \\log \\frac{1+N}{1+df_j} + 1 \\right).\n",
+    "$$\n",
+    "\n",
+    "Moreover, the output of the previous cell shows that the tf-idf values are normalized such that each document vector has an L2-norm of 1."
    ]
   },
   {
@@ -667,7 +368,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 20,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
@@ -678,7 +379,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Number of Tokens: 999994\n",
+      "Number of Tokens: 2000000\n",
       "Dimension of a word vector: 300\n"
      ]
     }
@@ -704,29 +405,166 @@
     "))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Show the words of the first 10 index-positions:"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {
-    "slideshow": {
-     "slide_type": "slide"
-    }
-   },
+   "execution_count": 30,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "than\n",
-      "First 10 components of word-vector: \n",
-      " [ 0.1016 -0.1216 -0.0356  0.0096 -0.1015  0.1766 -0.0593  0.032   0.0892\n",
-      " -0.0727]\n"
+      "0 ,\n",
+      "1 the\n",
+      "2 .\n",
+      "3 and\n",
+      "4 to\n",
+      "5 of\n",
+      "6 a\n",
+      "7 </s>\n",
+      "8 in\n",
+      "9 is\n"
      ]
     }
    ],
    "source": [
-    "print(words[100])\n",
-    "print(\"First 10 components of word-vector: \\n\",en_model[words[100]][:10])"
+    "for i in range(10):\n",
+    "    print(i,en_model.index_to_key[i])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Word vector of word at index 1:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([-0.052,  0.074, -0.013,  0.045, -0.034,  0.021,  0.007, -0.016,\n",
+       "       -0.018, -0.002, -0.102,  0.006,  0.026, -0.003, -0.059, -0.038,\n",
+       "        0.016,  0.015, -0.009, -0.018, -0.009, -0.008, -0.018,  0.009,\n",
+       "        0.001, -0.094,  0.014,  0.015, -0.039, -0.029,  0.009, -0.025,\n",
+       "       -0.01 , -0.221, -0.023, -0.009, -0.032,  0.082,  0.002,  0.028,\n",
+       "        0.007, -0.009, -0.035, -0.018, -0.071,  0.063, -0.009, -0.022,\n",
+       "       -0.006,  0.052, -0.031,  0.044, -0.011, -0.056,  0.009, -0.067,\n",
+       "        0.01 ,  0.057,  0.01 , -0.028,  0.047,  0.005,  0.003,  0.001,\n",
+       "        0.044,  0.007, -0.033,  0.009, -0.008,  0.007,  0.092,  0.031,\n",
+       "        0.054,  0.028, -0.02 , -0.033,  0.005,  0.036,  0.225,  0.093,\n",
+       "       -0.012,  0.009, -0.06 ,  0.068,  0.04 ,  0.001,  0.046, -0.044,\n",
+       "        0.006,  0.092, -0.041, -0.015, -0.023,  0.009,  0.059,  0.028,\n",
+       "        0.065, -0.057, -0.013,  0.047,  0.035, -0.012, -0.008, -0.131,\n",
+       "        0.013, -0.051,  0.011,  0.012, -0.022,  0.039,  0.022,  0.024,\n",
+       "        0.004,  0.115,  0.023, -0.047, -0.046, -0.019,  0.008, -0.03 ,\n",
+       "       -0.035, -0.029, -0.04 ,  0.024, -0.01 ,  0.058, -0.039, -0.012,\n",
+       "       -0.03 ,  0.247, -0.011,  0.036,  0.005,  0.209, -0.102,  0.034,\n",
+       "        0.069, -0.071,  0.027, -0.042,  0.008, -0.027,  0.007,  0.004,\n",
+       "        0.035, -0.006, -0.446,  0.01 , -0.012, -0.045, -0.17 ,  0.05 ,\n",
+       "        0.093, -0.004, -0.004,  0.032,  0.203,  0.061, -0.03 ,  0.023,\n",
+       "       -0.019,  0.017,  0.148, -0.018, -0.013,  0.069,  0.033, -0.03 ,\n",
+       "        0.043,  0.005,  0.023,  0.01 ,  0.073,  0.008, -0.005,  0.054,\n",
+       "       -0.032,  0.051,  0.029, -0.059, -0.   ,  0.049,  0.017, -0.014,\n",
+       "        0.036,  0.054, -0.001, -0.059,  0.016, -0.022, -0.02 ,  0.023,\n",
+       "       -0.068,  0.018,  0.003,  0.011,  0.047, -0.044,  0.032,  0.02 ,\n",
+       "       -0.065,  0.339,  0.07 , -0.022, -0.024, -0.003, -0.003, -0.062,\n",
+       "        0.012,  0.038, -0.02 ,  0.024, -0.088,  0.02 , -0.006, -0.026,\n",
+       "       -0.019, -0.026,  0.019, -0.042,  0.025,  0.083, -0.01 ,  0.129,\n",
+       "        0.062,  0.054,  0.019,  0.042,  0.18 , -0.001, -0.033, -0.056,\n",
+       "       -0.016,  0.049,  0.035, -0.042,  0.016, -0.077, -0.066,  0.05 ,\n",
+       "        0.01 ,  0.147, -0.071, -0.147,  0.474, -0.017, -0.005,  0.016,\n",
+       "        0.055, -0.063, -0.021,  0.012,  0.027,  0.006,  0.066,  0.011,\n",
+       "       -0.071, -0.021, -0.078, -0.029, -0.028, -0.157, -0.039,  0.005,\n",
+       "        0.02 , -0.003,  0.044,  0.028, -0.039,  0.037, -0.004, -0.016,\n",
+       "       -0.073, -0.164,  0.065, -0.006, -0.065, -0.198, -0.041, -0.153,\n",
+       "        0.002,  0.013, -0.236, -0.053, -0.004, -0.045,  0.011, -0.033,\n",
+       "       -0.055,  0.001,  0.017, -0.044, -0.058,  0.022, -0.078, -0.043,\n",
+       "       -0.025,  0.237,  0.   , -0.004], dtype=float32)"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "en_model[words[1]]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Word vector of word *the*:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([-0.052,  0.074, -0.013,  0.045, -0.034,  0.021,  0.007, -0.016,\n",
+       "       -0.018, -0.002, -0.102,  0.006,  0.026, -0.003, -0.059, -0.038,\n",
+       "        0.016,  0.015, -0.009, -0.018, -0.009, -0.008, -0.018,  0.009,\n",
+       "        0.001, -0.094,  0.014,  0.015, -0.039, -0.029,  0.009, -0.025,\n",
+       "       -0.01 , -0.221, -0.023, -0.009, -0.032,  0.082,  0.002,  0.028,\n",
+       "        0.007, -0.009, -0.035, -0.018, -0.071,  0.063, -0.009, -0.022,\n",
+       "       -0.006,  0.052, -0.031,  0.044, -0.011, -0.056,  0.009, -0.067,\n",
+       "        0.01 ,  0.057,  0.01 , -0.028,  0.047,  0.005,  0.003,  0.001,\n",
+       "        0.044,  0.007, -0.033,  0.009, -0.008,  0.007,  0.092,  0.031,\n",
+       "        0.054,  0.028, -0.02 , -0.033,  0.005,  0.036,  0.225,  0.093,\n",
+       "       -0.012,  0.009, -0.06 ,  0.068,  0.04 ,  0.001,  0.046, -0.044,\n",
+       "        0.006,  0.092, -0.041, -0.015, -0.023,  0.009,  0.059,  0.028,\n",
+       "        0.065, -0.057, -0.013,  0.047,  0.035, -0.012, -0.008, -0.131,\n",
+       "        0.013, -0.051,  0.011,  0.012, -0.022,  0.039,  0.022,  0.024,\n",
+       "        0.004,  0.115,  0.023, -0.047, -0.046, -0.019,  0.008, -0.03 ,\n",
+       "       -0.035, -0.029, -0.04 ,  0.024, -0.01 ,  0.058, -0.039, -0.012,\n",
+       "       -0.03 ,  0.247, -0.011,  0.036,  0.005,  0.209, -0.102,  0.034,\n",
+       "        0.069, -0.071,  0.027, -0.042,  0.008, -0.027,  0.007,  0.004,\n",
+       "        0.035, -0.006, -0.446,  0.01 , -0.012, -0.045, -0.17 ,  0.05 ,\n",
+       "        0.093, -0.004, -0.004,  0.032,  0.203,  0.061, -0.03 ,  0.023,\n",
+       "       -0.019,  0.017,  0.148, -0.018, -0.013,  0.069,  0.033, -0.03 ,\n",
+       "        0.043,  0.005,  0.023,  0.01 ,  0.073,  0.008, -0.005,  0.054,\n",
+       "       -0.032,  0.051,  0.029, -0.059, -0.   ,  0.049,  0.017, -0.014,\n",
+       "        0.036,  0.054, -0.001, -0.059,  0.016, -0.022, -0.02 ,  0.023,\n",
+       "       -0.068,  0.018,  0.003,  0.011,  0.047, -0.044,  0.032,  0.02 ,\n",
+       "       -0.065,  0.339,  0.07 , -0.022, -0.024, -0.003, -0.003, -0.062,\n",
+       "        0.012,  0.038, -0.02 ,  0.024, -0.088,  0.02 , -0.006, -0.026,\n",
+       "       -0.019, -0.026,  0.019, -0.042,  0.025,  0.083, -0.01 ,  0.129,\n",
+       "        0.062,  0.054,  0.019,  0.042,  0.18 , -0.001, -0.033, -0.056,\n",
+       "       -0.016,  0.049,  0.035, -0.042,  0.016, -0.077, -0.066,  0.05 ,\n",
+       "        0.01 ,  0.147, -0.071, -0.147,  0.474, -0.017, -0.005,  0.016,\n",
+       "        0.055, -0.063, -0.021,  0.012,  0.027,  0.006,  0.066,  0.011,\n",
+       "       -0.071, -0.021, -0.078, -0.029, -0.028, -0.157, -0.039,  0.005,\n",
+       "        0.02 , -0.003,  0.044,  0.028, -0.039,  0.037, -0.004, -0.016,\n",
+       "       -0.073, -0.164,  0.065, -0.006, -0.065, -0.198, -0.041, -0.153,\n",
+       "        0.002,  0.013, -0.236, -0.053, -0.004, -0.045,  0.011, -0.033,\n",
+       "       -0.055,  0.001,  0.017, -0.044, -0.058,  0.022, -0.078, -0.043,\n",
+       "       -0.025,  0.237,  0.   , -0.004], dtype=float32)"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "en_model.get_vector(\"the\")"
    ]
   },
   {
@@ -742,7 +580,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 22,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
@@ -752,19 +590,19 @@
     {
      "data": {
       "text/plain": [
-       "[('cars', 0.8045915365219116),\n",
-       " ('automobile', 0.7667388916015625),\n",
-       " ('vehicle', 0.7534859776496887),\n",
-       " ('Car', 0.7177952527999878),\n",
-       " ('truck', 0.6989946961402893),\n",
-       " ('SUV', 0.6896128058433533),\n",
-       " ('automobiles', 0.6783526539802551),\n",
-       " ('dealership', 0.6682884097099304),\n",
-       " ('garage', 0.6681075096130371),\n",
-       " ('driver', 0.6541328430175781)]"
+       "[('cars', 0.73371422290802),\n",
+       " ('vehicle', 0.7271659970283508),\n",
+       " ('automobile', 0.7021709680557251),\n",
+       " ('car--and', 0.7012600302696228),\n",
+       " ('car.But', 0.6894583106040955),\n",
+       " ('car.It', 0.6796760559082031),\n",
+       " ('car.So', 0.679090142250061),\n",
+       " ('car.Now', 0.6771497130393982),\n",
+       " ('car.', 0.6755067706108093),\n",
+       " ('car.When', 0.6720768809318542)]"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -773,6 +611,274 @@
     "en_model.most_similar(\"car\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Show all words whose vector is closer to the vector of *car* than the vector of *truck* is:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['vehicle',\n",
+       " 'cars',\n",
+       " 'driving',\n",
+       " 'driver',\n",
+       " 'bike',\n",
+       " 'vehicles',\n",
+       " 'Car',\n",
+       " 'truck',\n",
+       " 'rental',\n",
+       " 'garage',\n",
+       " 'auto',\n",
+       " 'passenger',\n",
+       " 'BMW',\n",
+       " 'motorcycle',\n",
+       " 'taxi',\n",
+       " 'bicycle',\n",
+       " 'automotive',\n",
+       " 'Audi',\n",
+       " 'automobile',\n",
+       " 'SUV',\n",
+       " 'parked',\n",
+       " 'driveway',\n",
+       " 'dealership',\n",
+       " 'sedan',\n",
+       " 'windshield',\n",
+       " 'automobiles',\n",
+       " 'scooter',\n",
+       " 'coupe',\n",
+       " 'jeep',\n",
+       " 'towed',\n",
+       " 'dealerships',\n",
+       " 'motorbike',\n",
+       " 'limo',\n",
+       " 'Lamborghini',\n",
+       " 'minivan',\n",
+       " 'limousine',\n",
+       " 'motorist',\n",
+       " 'hatchback',\n",
+       " 'windscreen',\n",
+       " 'chauffeur',\n",
+       " 'motorhome',\n",
+       " 'bmw',\n",
+       " 'supercar',\n",
+       " 'roadster',\n",
+       " 'car.',\n",
+       " 'minibus',\n",
+       " 'moped',\n",
+       " 'racecar',\n",
+       " 'suv',\n",
+       " 'drivable',\n",
+       " 'go-kart',\n",
+       " 'sportscar',\n",
+       " 'car.The',\n",
+       " 'mini-van',\n",
+       " 'cars.',\n",
+       " 'vehicle.',\n",
+       " 'camry',\n",
+       " 'car-',\n",
+       " 'car.I',\n",
+       " 'motorcar',\n",
+       " 'bimmer',\n",
+       " 'go-cart',\n",
+       " 'vehicle.The',\n",
+       " 'cars.The',\n",
+       " 'jalopy',\n",
+       " 'motorcyle',\n",
+       " 'sports-car',\n",
+       " 'car.We',\n",
+       " 'car.It',\n",
+       " 'car-related',\n",
+       " 'rent-a-car',\n",
+       " 'car.This',\n",
+       " 'tow-truck',\n",
+       " 'vehical',\n",
+       " 'beemer',\n",
+       " 'cars-',\n",
+       " 'car.A',\n",
+       " 'cars.I',\n",
+       " 'vechicle',\n",
+       " 'motocycle',\n",
+       " 'econobox',\n",
+       " 'car.If',\n",
+       " 'vehicl',\n",
+       " 'car.In',\n",
+       " 'vechile',\n",
+       " 'non-car',\n",
+       " 'rental-car',\n",
+       " 'car.You',\n",
+       " 'carI',\n",
+       " 'undriveable',\n",
+       " 'car.He',\n",
+       " 'car.And',\n",
+       " 'car.But',\n",
+       " 'vehicule',\n",
+       " 'car.So',\n",
+       " 'car.As',\n",
+       " 'valeted',\n",
+       " 'car-share',\n",
+       " 'car.My',\n",
+       " 'thecar',\n",
+       " 'car.There',\n",
+       " 'undrivable',\n",
+       " 'car.When',\n",
+       " 'vehicle.This',\n",
+       " 'carA',\n",
+       " 'hovercar',\n",
+       " 'cars.This',\n",
+       " 'vechicles',\n",
+       " 'vehcile',\n",
+       " 'carand',\n",
+       " 'motor-bike',\n",
+       " 'vehichle',\n",
+       " 'mini-car',\n",
+       " 'hooptie',\n",
+       " 'cars.It',\n",
+       " 'stationwagon',\n",
+       " 'stickshift',\n",
+       " 'car.What',\n",
+       " 'hoopty',\n",
+       " 'vehicle.A',\n",
+       " 'cars.In',\n",
+       " 'rearended',\n",
+       " 'vehicle.We',\n",
+       " 'car.That',\n",
+       " 'car.How',\n",
+       " 'car--',\n",
+       " 'Mini-Cooper',\n",
+       " '-Car',\n",
+       " 'car.She',\n",
+       " 'car.At',\n",
+       " 'car.They',\n",
+       " 'vehicle.It',\n",
+       " 'car.For',\n",
+       " 'car.After',\n",
+       " 'cars.But',\n",
+       " '-car',\n",
+       " 'car.On',\n",
+       " 'super-car',\n",
+       " 'cars.A',\n",
+       " 'motorscooter',\n",
+       " 'car-shopping',\n",
+       " 'station-wagon',\n",
+       " 'drive-able',\n",
+       " 'car.-',\n",
+       " 'car.All',\n",
+       " 'towtruck',\n",
+       " 'autoshop',\n",
+       " 'car.Now',\n",
+       " 'carthe',\n",
+       " 'car-shaped',\n",
+       " 'vehilce',\n",
+       " 'car-jacked',\n",
+       " 'non-BMW',\n",
+       " 'car.One',\n",
+       " 'car-owners',\n",
+       " 'motorcylce',\n",
+       " 'porshe',\n",
+       " 'cars.If',\n",
+       " 'car.Then',\n",
+       " 'car.To',\n",
+       " 'vehichles',\n",
+       " 'auto-mobile',\n",
+       " 'van.I',\n",
+       " 'carcar',\n",
+       " 'car.No',\n",
+       " 'cars.There',\n",
+       " 'car.With',\n",
+       " 'cars.And',\n",
+       " 'car.Here',\n",
+       " 'racing-car',\n",
+       " 'parked.',\n",
+       " 'car.2.',\n",
+       " 'car.Very',\n",
+       " 'car.3.',\n",
+       " 'cars.So',\n",
+       " 'car.Also',\n",
+       " 'sportcar',\n",
+       " 'cars.As',\n",
+       " 'cars.You',\n",
+       " 'car.Car',\n",
+       " 'Multi-car',\n",
+       " 'car.While',\n",
+       " 'Mini-van',\n",
+       " 'vehice',\n",
+       " 'car--and',\n",
+       " 'vehicle.When',\n",
+       " 'seat-belted',\n",
+       " 'vehicle.As',\n",
+       " 'pimpmobile',\n",
+       " 'car-owner',\n",
+       " 'car.Please',\n",
+       " 'vehicle.There',\n",
+       " 'run-about',\n",
+       " 'micro-car',\n",
+       " 'cars.For',\n",
+       " 'car.Read',\n",
+       " 'car.An',\n",
+       " 'dealship',\n",
+       " 'veichle',\n",
+       " 'car.Just',\n",
+       " 'car.Not',\n",
+       " 'Smartcar',\n",
+       " 'P-car',\n",
+       " 'vehicle.But',\n",
+       " 'cars.My',\n",
+       " 'cars.That',\n",
+       " 'suv.',\n",
+       " 'car.By',\n",
+       " 'car.Do',\n",
+       " 'vehicle.So',\n",
+       " 'car.However',\n",
+       " 'covertible',\n",
+       " 'rentacar',\n",
+       " 'repossed',\n",
+       " 'car-style',\n",
+       " 'cars.He',\n",
+       " 'car-specific',\n",
+       " 'car.and',\n",
+       " 'minvan',\n",
+       " 'CarI',\n",
+       " 'seatbelted',\n",
+       " 'vehicle.To',\n",
+       " 'cars.What',\n",
+       " 'econo-box',\n",
+       " 'yourcar',\n",
+       " 'car.Well',\n",
+       " 'cars.They',\n",
+       " 're-park',\n",
+       " 'carIf',\n",
+       " 'mom-mobile',\n",
+       " 'smartcar',\n",
+       " 'avehicle',\n",
+       " 'car.2',\n",
+       " 'car-show',\n",
+       " 'hatch-back',\n",
+       " 'cars.When',\n",
+       " 'car.Some',\n",
+       " 'vehicle.All',\n",
+       " 'mycar',\n",
+       " 'car.Once',\n",
+       " 'car.4.',\n",
+       " 'carage']"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "en_model.closer_than(\"car\",\"truck\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -787,7 +893,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 37,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
@@ -799,12 +905,12 @@
     "import numpy as np\n",
     "#GLOVE_DIR = \"./Data/glove.6B\"\n",
     "#GLOVE_DIR =\"/Users/maucher/DataSets/glove.6B\"\n",
-    "GLOVE_DIR = '/Users/johannes/DataSets/Gensim/glove/'"
+    "GLOVE_DIR = '/Users/johannes/DataSets/Gensim/glove'"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 38,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
@@ -815,7 +921,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/var/folders/68/9tltm6l520v0stj3qjlc5v9w0000gn/T/ipykernel_24511/1701203096.py:8: DeprecationWarning: Call to deprecated `glove2word2vec` (KeyedVectors.load_word2vec_format(.., binary=False, no_header=True) loads GLoVE text vectors.).\n",
+      "/var/folders/68/9tltm6l520v0stj3qjlc5v9w0000gn/T/ipykernel_7221/1167505750.py:8: DeprecationWarning: Call to deprecated `glove2word2vec` (KeyedVectors.load_word2vec_format(.., binary=False, no_header=True) loads GLoVE text vectors.).\n",
       "  _ = glove2word2vec(glove_file, tmp_file)\n"
      ]
     }
@@ -825,7 +931,7 @@
     "from gensim.models import KeyedVectors\n",
     "from gensim.scripts.glove2word2vec import glove2word2vec\n",
     "\n",
-    "glove_file = datapath(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))\n",
+    "glove_file = datapath(os.path.join(GLOVE_DIR, 'glove.6B.50d.txt'))\n",
     "tmp_file = get_tmpfile(os.path.join(GLOVE_DIR, 'test_word2vec.txt'))\n",
     "\n",
     "_ = glove2word2vec(glove_file, tmp_file)\n",
@@ -834,7 +940,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 39,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
@@ -844,19 +950,19 @@
     {
      "data": {
       "text/plain": [
-       "[('vehicle', 0.8630837798118591),\n",
-       " ('truck', 0.8597878813743591),\n",
-       " ('cars', 0.837166965007782),\n",
-       " ('driver', 0.8185911178588867),\n",
-       " ('driving', 0.7812635898590088),\n",
-       " ('motorcycle', 0.7553157210350037),\n",
-       " ('vehicles', 0.7462256550788879),\n",
-       " ('parked', 0.74594646692276),\n",
-       " ('bus', 0.7372707724571228),\n",
-       " ('taxi', 0.7155268788337708)]"
+       "[('truck', 0.92085862159729),\n",
+       " ('cars', 0.8870189785957336),\n",
+       " ('vehicle', 0.8833683729171753),\n",
+       " ('driver', 0.8464019298553467),\n",
+       " ('driving', 0.8384189009666443),\n",
+       " ('bus', 0.8210511803627014),\n",
+       " ('vehicles', 0.8174992799758911),\n",
+       " ('parked', 0.7902189493179321),\n",
+       " ('motorcycle', 0.7866503000259399),\n",
+       " ('taxi', 0.7833929657936096)]"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 39,
      "metadata": {},
      "output_type": "execute_result"
     }
-- 
GitLab