From ea75172611277b7399715afe41ade1d523df08f7 Mon Sep 17 00:00:00 2001 From: Johannes Maucher <johannes@Johanness-iMac.fritz.box> Date: Thu, 29 Sep 2022 16:29:01 +0200 Subject: [PATCH] mc --- 11ModellingWordsAndTexts.ipynb | 838 +++++++++++++++++++-------------- 1 file changed, 472 insertions(+), 366 deletions(-) diff --git a/11ModellingWordsAndTexts.ipynb b/11ModellingWordsAndTexts.ipynb index 0ead3ce..e8e1d6d 100644 --- a/11ModellingWordsAndTexts.ipynb +++ b/11ModellingWordsAndTexts.ipynb @@ -60,11 +60,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ - "import pandas as pd" + "import pandas as pd\n", + "import numpy as np\n", + "np.set_printoptions(suppress=True,precision=3)" ] }, { @@ -76,104 +78,13 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "slideshow": { "slide_type": "slide" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Word Index:\n" - ] - }, - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>0</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>all</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>and</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>at</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>boys</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>girls</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5</th>\n", - " <td>home</td>\n", - " </tr>\n", - " <tr>\n", - " <th>6</th>\n", - " 
<td>kids</td>\n", - " </tr>\n", - " <tr>\n", - " <th>7</th>\n", - " <td>not</td>\n", - " </tr>\n", - " <tr>\n", - " <th>8</th>\n", - " <td>stay</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " 0\n", - "0 all\n", - "1 and\n", - "2 at\n", - "3 boys\n", - "4 girls\n", - "5 home\n", - "6 kids\n", - "7 not\n", - "8 stay" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "simpleWordDF=pd.DataFrame(data=[\"all\", \"and\", \"at\", \"boys\", \"girls\", \"home\", \"kids\", \"not\", \"stay\"])\n", "print(\"\\nWord Index:\")\n", @@ -182,184 +93,13 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "slideshow": { "slide_type": "slide" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Corresponding One-Hot-Encoding\n" - ] - }, - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>_all</th>\n", - " <th>_and</th>\n", - " <th>_at</th>\n", - " <th>_boys</th>\n", - " <th>_girls</th>\n", - " <th>_home</th>\n", - " <th>_kids</th>\n", - " <th>_not</th>\n", - " <th>_stay</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " 
<td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5</th>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>6</th>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>7</th>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>8</th>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " _all _and _at _boys _girls _home _kids _not _stay\n", - "0 1 0 0 0 0 0 0 0 0\n", - "1 0 1 0 0 0 0 0 0 0\n", - "2 0 0 1 0 0 0 0 0 0\n", - "3 0 0 0 1 0 0 0 0 0\n", - "4 0 0 0 0 1 0 0 0 0\n", - "5 0 0 0 0 0 1 0 0 0\n", - "6 0 0 0 0 0 0 1 0 0\n", - "7 0 0 0 0 0 0 0 1 0\n", - "8 0 0 0 0 0 0 0 0 1" - ] - }, - "execution_count": 3, - "metadata": 
{}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "print(\"\\nCorresponding One-Hot-Encoding\")\n", "pd.get_dummies(simpleWordDF,prefix=\"\")" @@ -421,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "slideshow": { "slide_type": "slide" @@ -435,32 +175,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "slideshow": { "slide_type": "slide" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/johannes/opt/anaconda3/envs/books/lib/python3.8/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n", - " warnings.warn(msg, category=FutureWarning)\n" - ] - }, - { - "data": { - "text/plain": [ - "['all', 'and', 'at', 'boys', 'girls', 'home', 'kids', 'not', 'stay']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "corpus = ['not all kids stay at home.',\n", " 'all boys and girls stay not at home.',\n", @@ -471,25 +192,13 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "slideshow": { "slide_type": "slide" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[1, 0, 1, 0, 0, 1, 1, 1, 1],\n", - " [1, 1, 1, 1, 1, 1, 0, 1, 1]])" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "BoW.toarray()" ] @@ -517,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "slideshow": { "slide_type": "slide" @@ -531,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "slideshow": { "slide_type": "slide" @@ -544,27 +253,13 @@ }, { "cell_type": "code", - "execution_count": 9, + 
"execution_count": null, "metadata": { "slideshow": { "slide_type": "slide" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[0.37863221, 0. , 0.37863221, 0. , 0. ,\n", - " 0.37863221, 0.53215436, 0.37863221, 0.37863221],\n", - " [0.30253071, 0.42519636, 0.30253071, 0.42519636, 0.42519636,\n", - " 0.30253071, 0. , 0.30253071, 0.30253071]])" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tfidf_BoW.toarray()" ] @@ -577,7 +272,13 @@ } }, "source": [ - "As can be seen in the example above, words which appear in all documents are weighted by 0, i.e. they are considered to be not relevant." + "Inspecting the output of the previous cell shows that it does not match this expectation: we expect the tf-idf value of words which appear in all documents to be 0, but in the example above such words have non-zero tf-idf-values. The reason for this is that **in scikit-learn** the tf-idf is implemented in a slightly different way. As described in the documentation of the [scikit-learn TfidfTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html), scikit-learn with its default setting `smooth_idf=True` calculates the tf-idf as follows:\n", + "\n", + "$$\n", + "tfidf_{i,j}=tf_{i,j} \\cdot ( \\log \\frac{1+N}{1+df_j} + 1)\n", + "$$ \n", + "\n", + "Here $\\log$ is the natural logarithm; with `smooth_idf=False` the second factor is $\\log \\frac{N}{df_j} + 1$ instead. In both cases this factor is at least 1, so words which appear in all documents are not weighted by 0. Moreover, from the output of the previous cell we see that the tf-idf-values are normalized such that each vector has an L2-norm of 1." 
] }, { @@ -667,7 +368,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 20, "metadata": { "slideshow": { "slide_type": "slide" @@ -678,7 +379,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of Tokens: 999994\n", + "Number of Tokens: 2000000\n", "Dimension of a word vector: 300\n" ] } @@ -704,29 +405,166 @@ "))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Show the words of the first 10 index-positions:" + ] + }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, + "execution_count": 30, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "than\n", - "First 10 components of word-vector: \n", - " [ 0.1016 -0.1216 -0.0356 0.0096 -0.1015 0.1766 -0.0593 0.032 0.0892\n", - " -0.0727]\n" + "0 ,\n", + "1 the\n", + "2 .\n", + "3 and\n", + "4 to\n", + "5 of\n", + "6 a\n", + "7 </s>\n", + "8 in\n", + "9 is\n" ] } ], "source": [ - "print(words[100])\n", - "print(\"First 10 components of word-vector: \\n\",en_model[words[100]][:10])" + "for i in range(10):\n", + " print(i,en_model.index_to_key[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Word vector of word at index 1:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.052, 0.074, -0.013, 0.045, -0.034, 0.021, 0.007, -0.016,\n", + " -0.018, -0.002, -0.102, 0.006, 0.026, -0.003, -0.059, -0.038,\n", + " 0.016, 0.015, -0.009, -0.018, -0.009, -0.008, -0.018, 0.009,\n", + " 0.001, -0.094, 0.014, 0.015, -0.039, -0.029, 0.009, -0.025,\n", + " -0.01 , -0.221, -0.023, -0.009, -0.032, 0.082, 0.002, 0.028,\n", + " 0.007, -0.009, -0.035, -0.018, -0.071, 0.063, -0.009, -0.022,\n", + " -0.006, 0.052, -0.031, 0.044, -0.011, -0.056, 0.009, -0.067,\n", + " 0.01 , 0.057, 0.01 , -0.028, 0.047, 0.005, 0.003, 0.001,\n", + " 0.044, 0.007, -0.033, 0.009, 
-0.008, 0.007, 0.092, 0.031,\n", + " 0.054, 0.028, -0.02 , -0.033, 0.005, 0.036, 0.225, 0.093,\n", + " -0.012, 0.009, -0.06 , 0.068, 0.04 , 0.001, 0.046, -0.044,\n", + " 0.006, 0.092, -0.041, -0.015, -0.023, 0.009, 0.059, 0.028,\n", + " 0.065, -0.057, -0.013, 0.047, 0.035, -0.012, -0.008, -0.131,\n", + " 0.013, -0.051, 0.011, 0.012, -0.022, 0.039, 0.022, 0.024,\n", + " 0.004, 0.115, 0.023, -0.047, -0.046, -0.019, 0.008, -0.03 ,\n", + " -0.035, -0.029, -0.04 , 0.024, -0.01 , 0.058, -0.039, -0.012,\n", + " -0.03 , 0.247, -0.011, 0.036, 0.005, 0.209, -0.102, 0.034,\n", + " 0.069, -0.071, 0.027, -0.042, 0.008, -0.027, 0.007, 0.004,\n", + " 0.035, -0.006, -0.446, 0.01 , -0.012, -0.045, -0.17 , 0.05 ,\n", + " 0.093, -0.004, -0.004, 0.032, 0.203, 0.061, -0.03 , 0.023,\n", + " -0.019, 0.017, 0.148, -0.018, -0.013, 0.069, 0.033, -0.03 ,\n", + " 0.043, 0.005, 0.023, 0.01 , 0.073, 0.008, -0.005, 0.054,\n", + " -0.032, 0.051, 0.029, -0.059, -0. , 0.049, 0.017, -0.014,\n", + " 0.036, 0.054, -0.001, -0.059, 0.016, -0.022, -0.02 , 0.023,\n", + " -0.068, 0.018, 0.003, 0.011, 0.047, -0.044, 0.032, 0.02 ,\n", + " -0.065, 0.339, 0.07 , -0.022, -0.024, -0.003, -0.003, -0.062,\n", + " 0.012, 0.038, -0.02 , 0.024, -0.088, 0.02 , -0.006, -0.026,\n", + " -0.019, -0.026, 0.019, -0.042, 0.025, 0.083, -0.01 , 0.129,\n", + " 0.062, 0.054, 0.019, 0.042, 0.18 , -0.001, -0.033, -0.056,\n", + " -0.016, 0.049, 0.035, -0.042, 0.016, -0.077, -0.066, 0.05 ,\n", + " 0.01 , 0.147, -0.071, -0.147, 0.474, -0.017, -0.005, 0.016,\n", + " 0.055, -0.063, -0.021, 0.012, 0.027, 0.006, 0.066, 0.011,\n", + " -0.071, -0.021, -0.078, -0.029, -0.028, -0.157, -0.039, 0.005,\n", + " 0.02 , -0.003, 0.044, 0.028, -0.039, 0.037, -0.004, -0.016,\n", + " -0.073, -0.164, 0.065, -0.006, -0.065, -0.198, -0.041, -0.153,\n", + " 0.002, 0.013, -0.236, -0.053, -0.004, -0.045, 0.011, -0.033,\n", + " -0.055, 0.001, 0.017, -0.044, -0.058, 0.022, -0.078, -0.043,\n", + " -0.025, 0.237, 0. 
, -0.004], dtype=float32)" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "en_model[words[1]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Word vector of word *the*:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.052, 0.074, -0.013, 0.045, -0.034, 0.021, 0.007, -0.016,\n", + " -0.018, -0.002, -0.102, 0.006, 0.026, -0.003, -0.059, -0.038,\n", + " 0.016, 0.015, -0.009, -0.018, -0.009, -0.008, -0.018, 0.009,\n", + " 0.001, -0.094, 0.014, 0.015, -0.039, -0.029, 0.009, -0.025,\n", + " -0.01 , -0.221, -0.023, -0.009, -0.032, 0.082, 0.002, 0.028,\n", + " 0.007, -0.009, -0.035, -0.018, -0.071, 0.063, -0.009, -0.022,\n", + " -0.006, 0.052, -0.031, 0.044, -0.011, -0.056, 0.009, -0.067,\n", + " 0.01 , 0.057, 0.01 , -0.028, 0.047, 0.005, 0.003, 0.001,\n", + " 0.044, 0.007, -0.033, 0.009, -0.008, 0.007, 0.092, 0.031,\n", + " 0.054, 0.028, -0.02 , -0.033, 0.005, 0.036, 0.225, 0.093,\n", + " -0.012, 0.009, -0.06 , 0.068, 0.04 , 0.001, 0.046, -0.044,\n", + " 0.006, 0.092, -0.041, -0.015, -0.023, 0.009, 0.059, 0.028,\n", + " 0.065, -0.057, -0.013, 0.047, 0.035, -0.012, -0.008, -0.131,\n", + " 0.013, -0.051, 0.011, 0.012, -0.022, 0.039, 0.022, 0.024,\n", + " 0.004, 0.115, 0.023, -0.047, -0.046, -0.019, 0.008, -0.03 ,\n", + " -0.035, -0.029, -0.04 , 0.024, -0.01 , 0.058, -0.039, -0.012,\n", + " -0.03 , 0.247, -0.011, 0.036, 0.005, 0.209, -0.102, 0.034,\n", + " 0.069, -0.071, 0.027, -0.042, 0.008, -0.027, 0.007, 0.004,\n", + " 0.035, -0.006, -0.446, 0.01 , -0.012, -0.045, -0.17 , 0.05 ,\n", + " 0.093, -0.004, -0.004, 0.032, 0.203, 0.061, -0.03 , 0.023,\n", + " -0.019, 0.017, 0.148, -0.018, -0.013, 0.069, 0.033, -0.03 ,\n", + " 0.043, 0.005, 0.023, 0.01 , 0.073, 0.008, -0.005, 0.054,\n", + " -0.032, 0.051, 0.029, -0.059, -0. 
, 0.049, 0.017, -0.014,\n", + " 0.036, 0.054, -0.001, -0.059, 0.016, -0.022, -0.02 , 0.023,\n", + " -0.068, 0.018, 0.003, 0.011, 0.047, -0.044, 0.032, 0.02 ,\n", + " -0.065, 0.339, 0.07 , -0.022, -0.024, -0.003, -0.003, -0.062,\n", + " 0.012, 0.038, -0.02 , 0.024, -0.088, 0.02 , -0.006, -0.026,\n", + " -0.019, -0.026, 0.019, -0.042, 0.025, 0.083, -0.01 , 0.129,\n", + " 0.062, 0.054, 0.019, 0.042, 0.18 , -0.001, -0.033, -0.056,\n", + " -0.016, 0.049, 0.035, -0.042, 0.016, -0.077, -0.066, 0.05 ,\n", + " 0.01 , 0.147, -0.071, -0.147, 0.474, -0.017, -0.005, 0.016,\n", + " 0.055, -0.063, -0.021, 0.012, 0.027, 0.006, 0.066, 0.011,\n", + " -0.071, -0.021, -0.078, -0.029, -0.028, -0.157, -0.039, 0.005,\n", + " 0.02 , -0.003, 0.044, 0.028, -0.039, 0.037, -0.004, -0.016,\n", + " -0.073, -0.164, 0.065, -0.006, -0.065, -0.198, -0.041, -0.153,\n", + " 0.002, 0.013, -0.236, -0.053, -0.004, -0.045, 0.011, -0.033,\n", + " -0.055, 0.001, 0.017, -0.044, -0.058, 0.022, -0.078, -0.043,\n", + " -0.025, 0.237, 0. 
, -0.004], dtype=float32)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "en_model.get_vector(\"the\")" ] }, { @@ -742,7 +580,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 22, "metadata": { "slideshow": { "slide_type": "slide" @@ -752,19 +590,19 @@ { "data": { "text/plain": [ - "[('cars', 0.8045915365219116),\n", - " ('automobile', 0.7667388916015625),\n", - " ('vehicle', 0.7534859776496887),\n", - " ('Car', 0.7177952527999878),\n", - " ('truck', 0.6989946961402893),\n", - " ('SUV', 0.6896128058433533),\n", - " ('automobiles', 0.6783526539802551),\n", - " ('dealership', 0.6682884097099304),\n", - " ('garage', 0.6681075096130371),\n", - " ('driver', 0.6541328430175781)]" + "[('cars', 0.73371422290802),\n", + " ('vehicle', 0.7271659970283508),\n", + " ('automobile', 0.7021709680557251),\n", + " ('car--and', 0.7012600302696228),\n", + " ('car.But', 0.6894583106040955),\n", + " ('car.It', 0.6796760559082031),\n", + " ('car.So', 0.679090142250061),\n", + " ('car.Now', 0.6771497130393982),\n", + " ('car.', 0.6755067706108093),\n", + " ('car.When', 0.6720768809318542)]" ] }, - "execution_count": 13, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -773,6 +611,274 @@ "en_model.most_similar(\"car\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Show all words whose vectors are closer to the vector of *car* than the vector of *lorry* is:" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['vehicle',\n", + " 'cars',\n", + " 'driving',\n", + " 'driver',\n", + " 'bike',\n", + " 'vehicles',\n", + " 'Car',\n", + " 'truck',\n", + " 'rental',\n", + " 'garage',\n", + " 'auto',\n", + " 'passenger',\n", + " 'BMW',\n", + " 'motorcycle',\n", + " 'taxi',\n", + " 'bicycle',\n", + " 'automotive',\n", + " 'Audi',\n", + " 'automobile',\n", + " 'SUV',\n", + " 
'parked',\n", + " 'driveway',\n", + " 'dealership',\n", + " 'sedan',\n", + " 'windshield',\n", + " 'automobiles',\n", + " 'scooter',\n", + " 'coupe',\n", + " 'jeep',\n", + " 'towed',\n", + " 'dealerships',\n", + " 'motorbike',\n", + " 'limo',\n", + " 'Lamborghini',\n", + " 'minivan',\n", + " 'limousine',\n", + " 'motorist',\n", + " 'hatchback',\n", + " 'windscreen',\n", + " 'chauffeur',\n", + " 'motorhome',\n", + " 'bmw',\n", + " 'supercar',\n", + " 'roadster',\n", + " 'car.',\n", + " 'minibus',\n", + " 'moped',\n", + " 'racecar',\n", + " 'suv',\n", + " 'drivable',\n", + " 'go-kart',\n", + " 'sportscar',\n", + " 'car.The',\n", + " 'mini-van',\n", + " 'cars.',\n", + " 'vehicle.',\n", + " 'camry',\n", + " 'car-',\n", + " 'car.I',\n", + " 'motorcar',\n", + " 'bimmer',\n", + " 'go-cart',\n", + " 'vehicle.The',\n", + " 'cars.The',\n", + " 'jalopy',\n", + " 'motorcyle',\n", + " 'sports-car',\n", + " 'car.We',\n", + " 'car.It',\n", + " 'car-related',\n", + " 'rent-a-car',\n", + " 'car.This',\n", + " 'tow-truck',\n", + " 'vehical',\n", + " 'beemer',\n", + " 'cars-',\n", + " 'car.A',\n", + " 'cars.I',\n", + " 'vechicle',\n", + " 'motocycle',\n", + " 'econobox',\n", + " 'car.If',\n", + " 'vehicl',\n", + " 'car.In',\n", + " 'vechile',\n", + " 'non-car',\n", + " 'rental-car',\n", + " 'car.You',\n", + " 'carI',\n", + " 'undriveable',\n", + " 'car.He',\n", + " 'car.And',\n", + " 'car.But',\n", + " 'vehicule',\n", + " 'car.So',\n", + " 'car.As',\n", + " 'valeted',\n", + " 'car-share',\n", + " 'car.My',\n", + " 'thecar',\n", + " 'car.There',\n", + " 'undrivable',\n", + " 'car.When',\n", + " 'vehicle.This',\n", + " 'carA',\n", + " 'hovercar',\n", + " 'cars.This',\n", + " 'vechicles',\n", + " 'vehcile',\n", + " 'carand',\n", + " 'motor-bike',\n", + " 'vehichle',\n", + " 'mini-car',\n", + " 'hooptie',\n", + " 'cars.It',\n", + " 'stationwagon',\n", + " 'stickshift',\n", + " 'car.What',\n", + " 'hoopty',\n", + " 'vehicle.A',\n", + " 'cars.In',\n", + " 'rearended',\n", + " 
'vehicle.We',\n", + " 'car.That',\n", + " 'car.How',\n", + " 'car--',\n", + " 'Mini-Cooper',\n", + " '-Car',\n", + " 'car.She',\n", + " 'car.At',\n", + " 'car.They',\n", + " 'vehicle.It',\n", + " 'car.For',\n", + " 'car.After',\n", + " 'cars.But',\n", + " '-car',\n", + " 'car.On',\n", + " 'super-car',\n", + " 'cars.A',\n", + " 'motorscooter',\n", + " 'car-shopping',\n", + " 'station-wagon',\n", + " 'drive-able',\n", + " 'car.-',\n", + " 'car.All',\n", + " 'towtruck',\n", + " 'autoshop',\n", + " 'car.Now',\n", + " 'carthe',\n", + " 'car-shaped',\n", + " 'vehilce',\n", + " 'car-jacked',\n", + " 'non-BMW',\n", + " 'car.One',\n", + " 'car-owners',\n", + " 'motorcylce',\n", + " 'porshe',\n", + " 'cars.If',\n", + " 'car.Then',\n", + " 'car.To',\n", + " 'vehichles',\n", + " 'auto-mobile',\n", + " 'van.I',\n", + " 'carcar',\n", + " 'car.No',\n", + " 'cars.There',\n", + " 'car.With',\n", + " 'cars.And',\n", + " 'car.Here',\n", + " 'racing-car',\n", + " 'parked.',\n", + " 'car.2.',\n", + " 'car.Very',\n", + " 'car.3.',\n", + " 'cars.So',\n", + " 'car.Also',\n", + " 'sportcar',\n", + " 'cars.As',\n", + " 'cars.You',\n", + " 'car.Car',\n", + " 'Multi-car',\n", + " 'car.While',\n", + " 'Mini-van',\n", + " 'vehice',\n", + " 'car--and',\n", + " 'vehicle.When',\n", + " 'seat-belted',\n", + " 'vehicle.As',\n", + " 'pimpmobile',\n", + " 'car-owner',\n", + " 'car.Please',\n", + " 'vehicle.There',\n", + " 'run-about',\n", + " 'micro-car',\n", + " 'cars.For',\n", + " 'car.Read',\n", + " 'car.An',\n", + " 'dealship',\n", + " 'veichle',\n", + " 'car.Just',\n", + " 'car.Not',\n", + " 'Smartcar',\n", + " 'P-car',\n", + " 'vehicle.But',\n", + " 'cars.My',\n", + " 'cars.That',\n", + " 'suv.',\n", + " 'car.By',\n", + " 'car.Do',\n", + " 'vehicle.So',\n", + " 'car.However',\n", + " 'covertible',\n", + " 'rentacar',\n", + " 'repossed',\n", + " 'car-style',\n", + " 'cars.He',\n", + " 'car-specific',\n", + " 'car.and',\n", + " 'minvan',\n", + " 'CarI',\n", + " 'seatbelted',\n", + " 
'vehicle.To',\n", + " 'cars.What',\n", + " 'econo-box',\n", + " 'yourcar',\n", + " 'car.Well',\n", + " 'cars.They',\n", + " 're-park',\n", + " 'carIf',\n", + " 'mom-mobile',\n", + " 'smartcar',\n", + " 'avehicle',\n", + " 'car.2',\n", + " 'car-show',\n", + " 'hatch-back',\n", + " 'cars.When',\n", + " 'car.Some',\n", + " 'vehicle.All',\n", + " 'mycar',\n", + " 'car.Once',\n", + " 'car.4.',\n", + " 'carage']" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "en_model.closer_than(\"car\",\"truck\")" + ] + }, { "cell_type": "markdown", "metadata": { @@ -787,7 +893,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 37, "metadata": { "slideshow": { "slide_type": "slide" @@ -799,12 +905,12 @@ "import numpy as np\n", "#GLOVE_DIR = \"./Data/glove.6B\"\n", "#GLOVE_DIR =\"/Users/maucher/DataSets/glove.6B\"\n", - "GLOVE_DIR = '/Users/johannes/DataSets/Gensim/glove/'" + "GLOVE_DIR = '/Users/johannes/DataSets/Gensim/glove'" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 38, "metadata": { "slideshow": { "slide_type": "slide" @@ -815,7 +921,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/68/9tltm6l520v0stj3qjlc5v9w0000gn/T/ipykernel_24511/1701203096.py:8: DeprecationWarning: Call to deprecated `glove2word2vec` (KeyedVectors.load_word2vec_format(.., binary=False, no_header=True) loads GLoVE text vectors.).\n", + "/var/folders/68/9tltm6l520v0stj3qjlc5v9w0000gn/T/ipykernel_7221/1167505750.py:8: DeprecationWarning: Call to deprecated `glove2word2vec` (KeyedVectors.load_word2vec_format(.., binary=False, no_header=True) loads GLoVE text vectors.).\n", " _ = glove2word2vec(glove_file, tmp_file)\n" ] } @@ -825,7 +931,7 @@ "from gensim.models import KeyedVectors\n", "from gensim.scripts.glove2word2vec import glove2word2vec\n", "\n", - "glove_file = datapath(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))\n", + "glove_file = 
datapath(os.path.join(GLOVE_DIR, 'glove.6B.50d.txt'))\n", "tmp_file = get_tmpfile(os.path.join(GLOVE_DIR, 'test_word2vec.txt'))\n", "\n", "_ = glove2word2vec(glove_file, tmp_file)\n", @@ -834,7 +940,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 39, "metadata": { "slideshow": { "slide_type": "slide" @@ -844,19 +950,19 @@ { "data": { "text/plain": [ - "[('vehicle', 0.8630837798118591),\n", - " ('truck', 0.8597878813743591),\n", - " ('cars', 0.837166965007782),\n", - " ('driver', 0.8185911178588867),\n", - " ('driving', 0.7812635898590088),\n", - " ('motorcycle', 0.7553157210350037),\n", - " ('vehicles', 0.7462256550788879),\n", - " ('parked', 0.74594646692276),\n", - " ('bus', 0.7372707724571228),\n", - " ('taxi', 0.7155268788337708)]" + "[('truck', 0.92085862159729),\n", + " ('cars', 0.8870189785957336),\n", + " ('vehicle', 0.8833683729171753),\n", + " ('driver', 0.8464019298553467),\n", + " ('driving', 0.8384189009666443),\n", + " ('bus', 0.8210511803627014),\n", + " ('vehicles', 0.8174992799758911),\n", + " ('parked', 0.7902189493179321),\n", + " ('motorcycle', 0.7866503000259399),\n", + " ('taxi', 0.7833929657936096)]" ] }, - "execution_count": 16, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } -- GitLab