You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
342 lines
104 KiB
342 lines
104 KiB
1 year ago
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 85,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"\n",
|
||
|
"from multiprocessing import Pool\n",
|
||
|
"from functools import partial\n",
|
||
|
"from itertools import repeat\n",
|
||
|
"\n",
|
||
|
"import pandas as pd\n",
|
||
|
"from wordfreq import word_frequency\n",
|
||
|
"\n",
|
||
|
"import sys\n",
|
||
|
"sys.path.append('../src')\n",
|
||
|
"\n",
|
||
|
"\n",
|
||
|
"from ankimaker.tasks.epub.load_epub import generate_corpus_from_epub_file\n",
|
||
|
"from ankimaker.tasks.dictionary import get_word_definitions_from_dictionary\n",
|
||
|
"\n",
|
||
|
"\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 86,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"ename": "TypeError",
|
||
|
"evalue": "generate_corpus_from_epub_file() takes 1 positional argument but 2 were given",
|
||
|
"output_type": "error",
|
||
|
"traceback": [
|
||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||
|
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
|
||
|
"Cell \u001b[0;32mIn[86], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m file \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m/home/gabriel/dev/ankimaker/data/german-epub.epub\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m----> 3\u001b[0m words_from_epub \u001b[39m=\u001b[39m generate_corpus_from_epub_file(file, \u001b[39m'\u001b[39;49m\u001b[39mgerman\u001b[39;49m\u001b[39m'\u001b[39;49m)\n",
|
||
|
"\u001b[0;31mTypeError\u001b[0m: generate_corpus_from_epub_file() takes 1 positional argument but 2 were given"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"\n",
|
||
|
"\n",
|
||
|
"file = '/home/gabriel/dev/ankimaker/data/german-epub.epub'\n",
|
||
|
"\n",
|
||
|
"words_from_epub = generate_corpus_from_epub_file(file, 'german')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"words_from_epub = pd.Series(words_from_epub)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"momo 811\n",
|
||
|
"mehr 299\n",
|
||
|
"sagte 246\n",
|
||
|
"ja 236\n",
|
||
|
"zeit 229\n",
|
||
|
" ... \n",
|
||
|
"ansehen 1\n",
|
||
|
"stillschweigend 1\n",
|
||
|
"liebhast 1\n",
|
||
|
"liebhaben 1\n",
|
||
|
"hinkte 1\n",
|
||
|
"Length: 7785, dtype: int64"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 19,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"words_from_epub.value_counts()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"0 1.200000e-04\n",
|
||
|
"1 4.680000e-04\n",
|
||
|
"2 1.510000e-06\n",
|
||
|
"3 2.690000e-05\n",
|
||
|
"4 1.740000e-04\n",
|
||
|
" ... \n",
|
||
|
"7780 1.230000e-07\n",
|
||
|
"7781 1.580000e-06\n",
|
||
|
"7782 1.230000e-05\n",
|
||
|
"7783 3.720000e-04\n",
|
||
|
"7784 1.820000e-07\n",
|
||
|
"Length: 7785, dtype: float64"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 21,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"pd.Series(words_from_epub.unique()).apply(lambda x: word_frequency(x, 'de'))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df = (\n",
|
||
|
" words_from_epub\n",
|
||
|
" .value_counts()\n",
|
||
|
" .to_frame()\n",
|
||
|
" .reset_index()\n",
|
||
|
" .rename(columns={'index': 'word', 0: 'epub_freq'})\n",
|
||
|
")\n",
|
||
|
"\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df['global_frequencies'] = df.word.apply(lambda x: word_frequency(x, 'de'))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df.epub_freq = df.epub_freq / df.epub_freq.sum()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df['epub_rank'] = df.epub_freq.rank(method='min')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"<Axes: >"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 75,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjgAAAGdCAYAAAAfTAk2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/OQEPoAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA8OElEQVR4nO3de3RU9b3//9ckIZNwmYRbZogEjKJABJSLwlS0VXKINPZ4iT1iU6SC+oMTrIEWkKp4q43FKuIFqNUSzk8oQpdYBQFjECgSLkbDVSLWaFCcBMVkACHXz/cPm11GgjIhMJPN87HWXsvsz3v2vD8dm7zcsz97O4wxRgAAADYSEeoGAAAAmhsBBwAA2A4BBwAA2A4BBwAA2A4BBwAA2A4BBwAA2A4BBwAA2A4BBwAA2E5UqBs4Xerr67Vv3z61a9dODocj1O0AAICTYIzRwYMHlZiYqIiIpp+HsW3A2bdvn5KSkkLdBgAAaIK9e/eqa9euTX69bQNOu3btJH37P5DL5QpxNwAA4GT4/X4lJSVZf8ebyrYBp+FrKZfLRcABAKCFOdXLS4L6cquurk7333+/kpOTFRsbq/PPP1+PPPKIjn1epzFG06dPV5cuXRQbG6vU1FTt2bMn4DgHDhxQZmamXC6X4uPjNXbsWB06dCigZtu2bbriiisUExOjpKQkzZgx4xSmCQAAziZBBZw//vGPmjNnjp599ll98MEH+uMf/6gZM2bomWeesWpmzJihp59+WnPnztWmTZvUpk0bpaWl6ejRo1ZNZmamdu7cqby8PC1btkzr1q3TnXfeaY37/X4NHz5c3bt3V2FhoR5//HE9+OCDev7555thygAAwO4c5tjTLz/g2muvldvt1osvvmjty8jIUGxsrF566SUZY5SYmKjf/OY3+u1vfytJqqyslNvtVm5urkaOHKkPPvhAKSkp2rJliwYNGiRJWrlypX7605/qs88+U2JioubMmaN7771XPp9P0dHRkqR77rlHr776qnbv3n1Svfr9fsXFxamyspKvqAAAaCGa6+93UGdwfvSjHyk/P18ffvihJGnr1q1av369RowYIUkqKSmRz+dTamqq9Zq4uDgNHjxYBQUFkqSCggLFx8db4UaSUlNTFRERoU2bNlk1V155pRVuJCktLU3FxcX6+uuvG+2tqqpKfr8/YAMAAGenoC4yvueee+T3+9WrVy9FRkaqrq5Ojz76qDIzMyVJPp9PkuR2uwNe53a7rTGfz6eEhITAJqKi1KFDh4Ca5OTk447RMNa+ffvjesvJydFDDz0UzHQAAIBNBXUGZ/HixVqwYIEWLlyo9957T/Pnz9ef/vQnzZ8//3T1d9KmTZumyspKa9u7d2+oWwIAACES1BmcyZMn65577tHIkSMlSX379tWnn36qnJwcjR49Wh6PR5JUVlamLl26WK8rKyvTJZdcIknyeDwqLy8POG5tba0OHDhgvd7j8aisrCygpuHnhprvcjqdcjqdwUwHAADYVFBncL755pvjbpscGRmp+vp6SVJycrI8Ho/y8/Otcb/fr02bNsnr9UqSvF6vKioqVFhYaNWsXr1a9fX1Gjx4sFWzbt061dTUWDV5eXnq2bNno19PAQAAHCuogPOzn/1Mjz76qJYvX65PPvlES5cu1ZNPPqkbbrhB0rc35cnOztbvf/97vfbaa9q+fbtuvfVWJSYm6vrrr5ck9e7dW9dcc43uuOMObd68We+8844mTJigkSNHKjExUZL0i1/8QtHR0Ro7dqx27typl19+WbNmzdKkSZOad/YAAMCeTBD8fr+5++67Tbdu3UxMTIw577zzzL333muqqqqsmvr6enP//fcbt9ttnE6nGTZsmCkuLg44zldffWVuueUW07ZtW+Nyucxtt91mDh48GFCzdetWM3ToUON0Os0555xjHnvssWBaNZWVlUaSqaysDOp1AAAgdJrr73dQ98FpSbgPDgAALU9I7oMDAADQEh
BwgrR1b4V+NW+zyg8e/eFiAAAQEgScINTXG035+zatKd6vn85ar3c++jLULQEAgEYQcIIQEeHQc5n91dPdTl8eqtIvX9ykB/6xQ/X1tryMCQCAFouAE6QeCe30jwmX65bLkmSMNL/gU438y0aV+/nKCgCAcEHAaYKYVpHKubGfpl7TS5ERDm0uOaD/mrlOb2z/ItStAQAAEXBOyfifnK8Vd1+h8zq3UeWRGv3vgvd019/eV21dfahbAwDgrEbAOUUXuttp6fjL1b1ja0nS61v3acAjefr0q8Mh7gwAgLMXAacZxLVupTW//Ylu6H+OJMl/tFY/fnyN/l74WYg7AwDg7ETAaSYOh0Mzb75Ei+4cYu377ZKtuu7Z9apjlRUAAGcUAaeZDTmvo5bdNdT6eetnlTr/d29o/8GqEHYFAMDZhYBzGvQ5J067H7lGSR1irX2XPvqWXn3/8xB2BQDA2YOAc5rEtIrUP6dcrdHe7ta+7JeL9KdVxbLp800BAAgbBJzT7KHr+ij3tkutn599+yP97Nn1Oni0JoRdAQBgbwScM+AnPRO04u4rrJ93fO5X3wff1LbPKkLXFAAANkbAOUN6d3Fpy72pOif+P9fl/Pez7+iFf34cwq4AALAnAs4Z1LmdU2//9ie65bIka9/vl3+gG2a/Iz9fWQEA0GwIOGdYdFSEHr6uj2bc1M/a935phfo9+KY2lxwIYWcAANgHAScEWkVG6H8GJWnz74apS1yMtf9//lygv64vYZUVAACniIATQgmuGOX/5sf61Y/OtfY9vGyX3iv9OnRNAQBgAwScEGsdHaUHfpaiR67vY53NeeGfJfrjyt1656MvQ9wdAAAtEwEnDDgcDo0a0l0Xd42XJK3Y4dOcNf/SuP+/kK+rAABogqhQN4D/mHxNT3Xv1FpVNfXK3fCJDlbV6ta/blantk7dl95bHds6Q90iAAAtAgEnjJzfua2mjegtY4yWbftCXx6q0j/3fPs11YDu7TVqSPcfOAIAAJAIOGHJ4XBoyTiv3vv0a7387l5tLjmgLSUHFB/bSjGtInXFBZ0U0yoy1G0CABC2CDhhKrlTGyV3aqOd+/zaXHJAr23dp9e27pMk/frqHpo0vGeIOwQAIHwRcMLcLZclqfTAYR2uqtMXlUf0yVffaO/XR0LdFgAAYc1hbLpMx+/3Ky4uTpWVlXK5XKFup1ks3FSq3y3drgjHtzcLjIxw6NfDLtC4H58f6tYAAGgWzfX3m2XiLUi/rnGKjoxQvZGqauv1TXWdXv/311YAAOA/+IqqBelzTpzevT9VB4/Wqqi0QlkL39OR6jodramTJEU4HIqOIrMCAEDAaWFcMa3kimmlrw5VSZI+/vKwet2/UpIU4ZDuGdFLd17JV1YAgLMb/7nfQp3fua3OiY8N2FdvpLUf7g9RRwAAhI+gAs65554rh8Nx3JaVlSVJOnr0qLKystSxY0e1bdtWGRkZKisrCzhGaWmp0tPT1bp1ayUkJGjy5Mmqra0NqFmzZo0GDBggp9OpHj16KDc399RmaUNtnFFaN+Uq7XgoTTseStNTN18iSaqurQ9tYwAAhIGgAs6WLVv0xRdfWFteXp4k6ec//7kkaeLEiXr99de1ZMkSrV27Vvv27dONN95ovb6urk7p6emqrq7Whg0bNH/+fOXm5mr69OlWTUlJidLT03XVVVepqKhI2dnZuv3227Vq1armmK+tREY41NYZpbbOKLWL+fbbRgIOAACnuEw8Oztby5Yt0549e+T3+9W5c2ctXLhQN910kyRp9+7d6t27twoKCjRkyBCtWLFC1157rfbt2ye32y1Jmjt3rqZOnar9+/crOjpaU6dO1fLly7Vjxw7rfUaOHKmKigqtXLnypHuz4zLx7/PPPfs16sXNahcTpaE9OgWMxUZH6q6rL1BypzYh6g4AgJMT8mXi1dXVeumllzRmzBg5HA4VFhaqpqZGqampVk2vXr3UrVs3FRQUSJIKCgrUt29fK9
xIUlpamvx+v3bu3GnVHHuMhpqGY5xIVVWV/H5/wHY2cbtiJEkHj9ZqxQ5fwPbKe5/rpY2fhrhDAADOnCavonr11Vd
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.epub_rank.plot()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df['global_rank'] = df.global_frequencies.rank(method='min')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"<Axes: >"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 78,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjgAAAGdCAYAAAAfTAk2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/OQEPoAAAACXBIWXMAAA9hAAAPYQGoP6dpAABfyUlEQVR4nO3de1wU9f4/8BcXWUBcEJVFFBXTVLwLpZtlqSQZp2+lp9OFo5aWPw07qR1NT2amlaYnTctLFxNLzUsnLS+JiLdULoqiiIo3FFMXVIRFlOvO7w9iYmEXdpdddnZ4PR+PfTxg5jOzn8/O7sx7PvO5OAmCIICIiIhIRpztnQEiIiIia2OAQ0RERLLDAIeIiIhkhwEOERERyQ4DHCIiIpIdBjhEREQkOwxwiIiISHYY4BAREZHsuNo7A7ai0+lw/fp1NGnSBE5OTvbODhEREZlAEATk5+cjICAAzs6W18PINsC5fv06AgMD7Z0NIiIissDVq1fRunVri7eXbYDTpEkTAOUfkFKptHNuiIiIyBRarRaBgYHiddxSsg1wKh5LKZVKBjhEREQOpq7NS8x6uFVWVob3338fQUFB8PDwwAMPPIA5c+ag8nydgiBg5syZaNmyJTw8PBAWFobz58/r7ScnJweRkZFQKpXw8fHBmDFjcPfuXb00J0+exGOPPQZ3d3cEBgZi/vz5dSgmERERNSRmBTiffvopli9fji+//BJnzpzBp59+ivnz5+OLL74Q08yfPx9LlizBihUrkJiYiMaNGyM8PByFhYVimsjISKSlpSE2Nhbbtm3DgQMHMHbsWHG9VqvFkCFD0LZtWyQnJ2PBggWYNWsWvv76aysUmYiIiOTOSahc/VKLv/3tb1CpVFi5cqW4bPjw4fDw8MCaNWsgCAICAgLwzjvv4N///jcAIC8vDyqVCtHR0XjppZdw5swZBAcH48iRIwgNDQUA7Ny5E08//TT++OMPBAQEYPny5Xjvvfeg0Wjg5uYGAJg2bRq2bNmCs2fPmpRXrVYLb29v5OXl8REVERGRg7DW9dusGpxHHnkEcXFxOHfuHADgxIkTOHjwIIYOHQoAyMjIgEajQVhYmLiNt7c3+vbti/j4eABAfHw8fHx8xOAGAMLCwuDs7IzExEQxzYABA8TgBgDCw8ORnp6OO3fuGMxbUVERtFqt3ouIiIgaJrMaGU+bNg1arRadO3eGi4sLysrK8PHHHyMyMhIAoNFoAAAqlUpvO5VKJa7TaDTw8/PTz4SrK3x9ffXSBAUFVdtHxbqmTZtWy9vcuXPx4YcfmlMcIiIikimzanA2btyItWvXYt26dTh27BhWr16N//73v1i9erWt8mey6dOnIy8vT3xdvXrV3lkiIiIiOzGrBmfKlCmYNm0aXnrpJQBA9+7dceXKFcydOxejRo2Cv78/ACArKwstW7YUt8vKykKvXr0AAP7+/sjOztbbb2lpKXJycsTt/f39kZWVpZem4v+KNFUpFAooFApzikNEREQyZVYNzr1796oNm+zi4gKdTgcACAoKgr+/P+Li4sT1Wq0WiYmJUKvVAAC1Wo3c3FwkJyeLafbs2QOdToe+ffuKaQ4cOICSkhIxTWxsLDp16mTw8RQRERFRZWYFOM888ww+/vhjbN++HZcvX8bmzZuxcOFCPP/88wDKB+WZOHEiPvroI/z6669ITU3FyJEjERAQgOeeew4A0KVLFzz11FN44403kJSUhEOHDmHChAl46aWXEBAQAAB45ZVX4ObmhjFjxiAtLQ0bNmzA4sWLMXnyZOuWnoiIiORJMINWqxXefvttoU2bNoK7u7vQvn174b333hOKiorENDqdTnj//fcFlUolKBQKYfDgwUJ6errefm7fvi28/PLLgpeXl6BUKoXXXntNyM/P10tz4sQJ4dFHHxUUCoXQqlUrYd68eeZkVcjLyxMACHl5eWZtR0RERPZjreu3WePgOBKOg0NEROR47DIODhEREZ
EjYIBjZ3n3S6DJK6w9YRXawhLsTc9GSZnOBrmi+pJ46Tau5963dzZsZtPRqzh04ZbN36ekTIe96dnILyypPTEAnU6WFdcmu1dcapP96nQCLmTnQwoPBo5n3sEvKdfsnQ2TXbldgMKSMpu/T5a2sNbrhk4nIPdescHljoQBjp31/HAX+s2NQ05B9S9TTUasTMJrq47gvzHpKCiyzcmqwh937kFby4Xjl5Rr2HRUemMPnbqWh893nzN44liTcAX/3nQCZQZ+tGsSrmDnKY3Bfe5Nz8a/N53A3Ro+97TrebVebI9l3sGLXyfgkXl7aimFZex9kTl9XYspP51E5LeJJm+T+kcexn5/FBdv3q09cSWLd5/Ha6uOYOR3SUbTFJeWn9Sv5txDn49isSi2fET2U9fysPHI1Ro/r8KSMmw9cR1Xbhdg64nrNV6Izmq0WJeYKV4MdqTewBML9uLUtTy9dIIg4D+bU7HyYIbJ5bSGvenZCJ4ZgwUxpk17AwDZ+YUmXXzn7TyLsIUHMD8mHXO2ncaIlYkGf1/14fllh/H2+hScuJpr8T7uFpXi1LU8m/+Wkq/k4PEF+/D0kt/N2i5bW4gD526anL+063no+0kcnl92qMZ0o1cfQa/ZsUj946/v7IwtqQj5KBZ590y7iZACBjhmir94G+2mbUe7adtxPivfavs9qzFvaomKH+1XBy6h6wcx4sm7Jocv3sK2k9fF/5Ov5ODz3edqjOav5d7Ho5/uRc8PdwEALmTn40K2/sWnqLQMb69PwZSfTpodqAHmX4h/SbmGxEu3TUr7ty8O4vPd5/HFnvPV1s3Ycgo/Jf+B3Wf0x1y6kH0XM7acwrg1fw1lcK+4FAt3pSPteh5eW3UEPyX/gW4fxBisfTtw7iYilhxE+KIDNeYt+bLhaUesIe9+CYKm78DQxeadMM0lCAISLt02eLdnSc3UM18exK7TWXh1lfFAxZBNyeXB9fHMXIPr52w7jQdn/IYzN7T476505N4rweK48u/E3744iKn/O4k9Z7MNbgsAc3ecwVs/HsfjC/bhrR+PY86200bTPvX57/jP5lRs+bP24M21x3D59j1ErTumly4pIwfrEjNr3Je5srSFOJ5Z/r0y9rv64Jc0AMDSvRcNrr9fXIZs7V/f66s59/Dwx3EYMH9vre//9YFLAIDl+y5i5cEM/H7+Fg5ftLwG71rufSyIOYssrfm13BUu3y6weNu/Lfkdf/viIOLOGP9uWMOvKeXn5Us3a89r5RunfnPjMPK7JOw6nVXDFsBH206jx6wYrE8q/52cuqatMWDdl34TAPBDwmVx2ZqETNy5V4Kes3fVmkepYIBjppe/SRD/fnLRAaRczcX8nWexN738B5CUkYPkK7a7cBlz625RrWle+SYRE9YdR8at8h/R8OXx+Hz3eaxNuCKm2ZF6A88vO4SrOfcAAEcv5wAABKH8LjZs4QGELdyPotK/fhyV79Aq1yblF5ZgQcxZo8Hb7btFiDuThdCPdmNvDReXys5n5ePt9Sl48euE2hNXcvq68QAyv1C/JqZqkLYj9QaCZ8ZgyZ4LiFhyUG/dh1vTql1IdqTeAABct+DRo7X8fflhAMCZG7adk+3XE9fx0tcJCP+8ejDnXIezy9WcmoOjsxqtSd/5ChW1JBW1Nob3afyG5ZcT1/X+33K89kcfp67pf/ZVLygFlR4TDfzvPtzIM1zmzNv3xN9jbcrvzg9jy/FrePiTOKw+fNmk7fT3sRsPfxIn1shWnNuy803/vCsrraEGp7abm8hvErB070WM/f5ojenyC0uw5fg1kx9Rmury7fLPfevJ67WkNF9F2QVBqHYOMuajbafRfdYu8ZhUfLQHz9ccRH57MAPawlL8UOlcH7ZwvwW5diwMcOrouaWHsGzfRby26gi+O5iBf3wVj+HLD6OkTIeSMh0+2XHG4B1MmU7AuB/+qiGAUH7XDQDzfjuL5fsuQhAElP5Zu3K/uAznrFRjVLXW4WKlu4
Y31x7D8cxczNhyqtp2lX+E94r+Olk7wcng+3y8/QyW7r2Ipz6vXotw5oYWIR/txpjVR3G7oBivRR8xKe9/2KC9Sm0
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.global_rank.plot()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df['rank_diff'] = df.epub_rank - df.global_rank"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"<Axes: >"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 81,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAGgCAYAAABL3XhTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/OQEPoAAAACXBIWXMAAA9hAAAPYQGoP6dpAABeIElEQVR4nO3deVxUVeMG8GdYZgBlhp0BRQQ3RHFDRdxNXtGwoqyfFplbmYal6etWZnualW2WtrypvWkuvWXlGmFqJeK+4IKaqLgAbszgxnp+fxA3BgaYgRlmmHm+n898Yu49c+85zMQ8nnvuOTIhhAARERGRHXOwdAWIiIiILI2BiIiIiOweAxERERHZPQYiIiIisnsMRERERGT3GIiIiIjI7jEQERERkd1jICIiIiK7x0BEREREdo+BiIiIiOyeWQNRcXExXnrpJYSEhMDV1RUtWrTA66+/jvKrhQghMHfuXAQEBMDV1RUxMTE4deqUznGuX7+OhIQEKJVKeHh4YNy4cbh586ZOmcOHD6NPnz5wcXFBUFAQFixYYM6mERERkQ1xMufB3377bSxevBjLly9Hu3btsHfvXowZMwYqlQrPPfccAGDBggX46KOPsHz5coSEhOCll15CbGwsjh07BhcXFwBAQkICLl++jKSkJBQWFmLMmDEYP348Vq5cCQDQarUYNGgQYmJisGTJEhw5cgRjx46Fh4cHxo8fX2M9S0pKcOnSJbi7u0Mmk5nvF0JEREQmI4RAXl4eAgMD4eBQxz4eYUZxcXFi7NixOtseeughkZCQIIQQoqSkRKjVavHOO+9I+3Nzc4VCoRDffvutEEKIY8eOCQBiz549UplNmzYJmUwmLl68KIQQ4tNPPxWenp4iPz9fKjNz5kzRpk0bg+qZmZkpAPDBBx988MEHHw3wkZmZWbugUo5Ze4h69uyJzz//HCdPnkTr1q1x6NAh/PHHH1i4cCEAICMjA1lZWYiJiZFeo1KpEBUVhZSUFIwYMQIpKSnw8PBA165dpTIxMTFwcHBAamoqHnzwQaSkpKBv376Qy+VSmdjYWLz99tu4ceMGPD09deqVn5+P/Px86bn4+xJeZmYmlEqlWX4XREREZFparRZBQUFwd3ev87HMGohmzZoFrVaLsLAwODo6ori4GG+++SYSEhIAAFlZWQAAf39/ndf5+/tL+7KysuDn56dbaScneHl56ZQJCQmpdIyyfRUD0bx58/Dqq69Wqq9SqWQgIiIiamBMMdzFrIOq16xZgxUrVmDlypXYv38/li9fjnfffRfLly8352lrNHv2bGg0GumRmZlp0foQERGRZZm1h2j69OmYNWsWRowYAQCIiIjAuXPnMG/ePIwaNQpqtRoAkJ2djYCAAOl12dnZ6NSpEwBArVYjJydH57hFRUW4fv269Hq1Wo3s7GydMmXPy8qUp1AooFAoTNNIIiIiavDM2kN0+/btSqO+HR0dUVJSAgAICQmBWq1GcnKytF+r1SI1NRXR0dEAgOjoaOTm5mLfvn1Sma1bt6KkpARRUVFSmR07dqCwsFAqk5SUhDZt2lS6XEZERERUkVkD0X333Yc333wTGzZswNmzZ/HDDz9g4cKFePDBBwGUXvObMmUK3njjDfz00084cuQInnjiCQQGBiI+Ph4A0LZtWwwePBhPPfUUdu/ejT///BOTJk3CiBEjEBgYCAB47LHHIJfLMW7cOBw9ehSrV6/Ghx9+iKlTp5qzeURERGQr6nyfWjW0Wq2YPHmyaNasmXBxcRGhoaHixRdf1Lk9vqSkRLz00kvC399fKBQKMXDgQJGenq5znGvXrolHH31UNG7cWCiVSjFmzBiRl5enU+bQoUOid+/eQqFQiCZNmoj58+cbXE+NRiMACI1GU7cGExERUb0x5fe3TIhy00bbKa1WC5VKBY1Gw7vMiIiIGghTfn9zLTMiIiKyewxEREREZPcYiIiIiMjuMRARERGR3WMgIiIiIrvHQE
RERER2j4GoAVuZeh7DP0tB5vXblq4KERFRg8ZA1EBla+/ihR+OIDXjOu5f9AdOZudhweYT0NwprPnFREREpMOsi7uS+dy4XVDu50IMen8HACBLexcL/6+ThWpFRETUMLGHyMYcuaCptE0Igas38y1QGyIiooaBgcgOzFmXhq5v/IqNRy5buipERERWiYHIDqxIPQ8AeHdLOgAg724hXv35KA6cv2HJahEREVkNBiI7tGBzOpb+eRYPfrrT0lUhIiKyCgxEduhUTp6lq0BERGRVGIjqkRDC0lUgIiIiPRiI6sllzR10ezMZC39J19n+7e7z+GbXOQvVioiIiAAGonrzUfIpXL2Zj4+2npa23S4owuzvj2DOujTklptXyJr8ePAi+i74Dccvay1dFSIiIrNhIKon+q6WFRb9s/FuYYlpzmOSo/xj8qqDOH/9NiavOmDiIxMREVkPBiIySEGRaQIbERGRNWIgqidX8kwzU/TdwmJsOZqFW/nFevfLTHIW4205moUJ/90HzW2upUZERA0P1zKrJ8knckxynBe+P4LvD1xEU09XkxzPVJ7+7z4AgL9SgVcfaG/h2hARERmHPUQNzPcHLgIALty4Y+Ga6HeFa6YREVEDxEBkhzgdEhERkS5eMiODGJuhrt3Mx75zN9CjhTd2nr6GvLuFuHarACN7BOP3U1ehcHZAMy83tPBtbJb6EhERGYOByIKEETFDCAGZzDRDpk10mGrdv+hPXMytfFlvyfa/kFtu4PXZ+XHmrwwREVENeMnMSlQXUtbuzUS3N3/F4Qu59VafutIXhgDohCEiIiJrwR6iBmD6d4cBAJNWWm5yREvdzk9ERFQf2ENkQbJyMcOQgc7FJTUX4nhpIiIi4zEQERERkd1jIGpAhAHdSOa6tGVozxNv6SciooaIgageXL9lmpXsmTWIiIjMg4OqzeznQ5fw7LemGQxtjb0vy3eexWfb/5Ke1/aW/i1HsxCocsWWo1lY9NtpuLs44cneoXi4a1P8ceoKhkQEIEtzFyey8nBfhwCdKQgOX8hFluYuBrVT17U5RERkpxiIzOyNDceq3GfIPEQrU8+bsjo1ytLcxWvrj2JUdHNEhXpL289du623/Ms/Ha3zOY9e0khroZXJu1uE9389ifd/PQkA+PV4DpKOZQMA3BVOGBDmJ5W9f9GfAICk5/uilb97netDRET2x+yXzC5evIjHH38c3t7ecHV1RUREBPbu3SvtF0Jg7ty5CAgIgKurK2JiYnDq1CmdY1y/fh0JCQlQKpXw8PDAuHHjcPPmTZ0yhw8fRp8+feDi4oKgoCAsWLDA3E0zqap6Vl744Yj0szETOVanup6mWd8fxsYjWRj++a5K+7K1d01y/orOXLlVY5myMASUBih9qgptRERENTFrILpx4wZ69eoFZ2dnbNq0CceOHcN7770HT09PqcyCBQvw0UcfYcmSJUhNTUWjRo0QGxuLu3f/+fJNSEjA0aNHkZSUhPXr12PHjh0YP368tF+r1WLQoEEIDg7Gvn378M477+CVV17B559/bs7m1TtDLpmdyrlZc6FqVLdobEFRSZ2OTUREZK3Mesns7bffRlBQEJYuXSptCwkJkX4WQuCDDz7AnDlz8MADDwAAvv76a/j7+2PdunUYMWIEjh8/js2bN2PPnj3o2rUrAODjjz/Gvffei3fffReBgYFYsWIFCgoK8NVXX0Eul6Ndu3Y4ePAgFi5cqBOcqFR143w4ASMREdkjs/YQ/fTTT+jatSseeeQR+Pn5oXPnzvjiiy+k/RkZGcjKykJMTIy0TaVSISoqCikpKQCAlJQUeHh4SGEIAGJiYuDg4IDU1FSpTN++fSGXy6UysbGxSE9Px40bNyrVKz8/H1qtVufRENTHmOr6WOesrky1phsREVEZswaiM2fOYPHixWjVqhW2bNmCiRMn4rnnnsPy5csBAFlZWQAAf39/ndf5+/tL+7KysuDn56ez38nJCV5eXjpl9B2j/D
nKmzdvHlQqlfQICgoyQWuNZ+xM1fVxl5lDubBxqYr1yIiIiGyNWQNRSUkJunTpgrfeegudO3fG+PHj8dRTT2HJkiX
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df.rank_diff.plot()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df['epub_rank_is_greater'] = df.epub_rank > df.global_rank"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"<Axes: >"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 84,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAisAAAGdCAYAAADT1TPdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/OQEPoAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAj7klEQVR4nO3df3BU1QG38e8mkE0oJEFjNiQGAoKgAgETSVe06uvWFBksre2kSAHx12CjBWMVokKkVkJtpViNUFHUGUVQX6VWMBQjYKkRSiAKiqAChlfdBErJhoAJZM/7h8OaJYmySNjD5vnM7Ey5e+7ec2bb5OnN3rsOY4wRAACApaLCPQEAAIBvQ6wAAACrESsAAMBqxAoAALAasQIAAKxGrAAAAKsRKwAAwGrECgAAsFqncE/gePj9fn3xxRfq1q2bHA5HuKcDAACOgzFGdXV1Sk1NVVTUiZ8fOS1i5YsvvlB6enq4pwEAAE7A7t27dfbZZ5/w/qdFrHTr1k3S14uNj48P82wAAMDx8Pl8Sk9PD/weP1GnRawc/dNPfHw8sQIAwGnm+36Egw/YAgAAqxErAADAasQKAACwGrECAACsRqwAAACrESsAAMBqxAoAALAasQIAAKxGrAAAAKuFHCtvv/22Ro0apdTUVDkcDi1duvQ791m9erUuvPBCOZ1O9e3bV88888wJTBUAAHREIcdKfX29MjMzVVJSclzjd+7cqZEjR+qKK65QZWWlpkyZoptuukkrVqwIebIAAKDjCfm7gUaMGKERI0Yc9/j58+erd+/eevjhhyVJ5513ntauXau//OUvys3NDfXwAACgg2n3LzIsLy+Xx+MJ2pabm6spU6a0uU9DQ4MaGhoC//b5fO0yt6fW7tT/+9/BwL+/OuzXC+urWozrEhOtg41N7TIHwAYfPfATxXaODvc0AKBV7R4rXq9XLpcraJvL5ZLP59OhQ4cUFxfXYp/i4mLNnDmzvaemZe9/oY1V+79zHKGCSDft/76vub8aGu5pAECrrLwaqLCwULW1tYHH7t272+U412adrfwrzgk8gI5qaeUX4Z4CALSp3c+spKSkqLq6OmhbdXW14uPjWz2rIklOp1NOp7O9p6axOb2C/r1oXZX+d/Bwux8XAAAcv3Y/s+J2u1VWVha0beXKlXK73e19aAAAEAFCjpUDBw6osrJSlZWVkr6+NLmyslJVVV9/MLWwsFDjx48PjJ80aZJ27Nihu+++Wx999JEef/xxvfjii7rjjjtOzgoAAEBECzlWNmzYoKFDh2ro0K8/jFdQUKChQ4dqxowZkqQvv/wyEC6S1Lt3by1btkwrV65UZmamHn74YT355JNctgwAAI5LyJ9Zufzyy2WMafP51u5Oe/nll2vTpk2hHgoAAMDOq4EAAACOIlYAAIDViBUAAGA1YgUAAFiNWGmm7Y8NAwCAcCFWAACA1YgVAABgNWIFAABYjVgBAABWI1YAAIDViBUAAGA1YgUAAFiNWAEAAFYjVgAAgNWIFQAAYDVipRnD/fYBALAOsQIAAKxGrDTjcIR7BgAA4FjECgAAsBqxAgAArEasAAAAqxErAADAasQKAACwGrECAACsRqwAAACrESsAAMBqxAoAALAasdIM3w0EAIB9iBUAAGA1YgUAAFiNWAEAAFYjVgAAgNWIFQAAYDViBQAAWI1YAQAAViNWAACA1YgVAABgNWIFAABYjVhpxnC/fQAArEOsAAAAqxErzTgcjnBPAQAAHINYAQAAViNWAACA1YgVAABgNWIFAABYjVgBAABWI1YAAIDViBUAAGA1YgUAAFiNWGmG2+0DAGAfYgUAAFiNWAEAAFYjVgAAgNWIFQAAYDViBQAAWI1YAQAAViNWAACA1U4oVkpKSpSRkaHY2Fjl5ORo/fr13zp+7ty56t+/v+Li4pSenq477rhDX3311QlNGAAAdCwhx8qSJUtUUFCgoqIibdy4UZmZmcrNzVVNTU2r4x
ctWqRp06apqKhIW7du1VNPPaUlS5bonnvu+d6TBwAAkS/kWJkzZ45uvvlmTZw4Ueeff77mz5+vLl26aOHCha2Of+eddzR8+HBdd911ysjI0FVXXaUxY8Z859kYAAAAKcRYaWxsVEVFhTwezzcvEBUlj8ej8vLyVve5+OKLVVFREYiTHTt2aPny5br66qvbPE5DQ4N8Pl/Q41TgZvsAANinUyiD9+7dq6amJrlcrqDtLpdLH330Uav7XHfdddq7d68uueQSGWN05MgRTZo06Vv/DFRcXKyZM2eGMjUAABCh2v1qoNWrV2vWrFl6/PHHtXHjRr3yyitatmyZHnjggTb3KSwsVG1tbeCxe/fu9p6mJMlxSo4CAABCEdKZlaSkJEVHR6u6ujpoe3V1tVJSUlrdZ/r06Ro3bpxuuukmSdKgQYNUX1+vW265Rffee6+iolr2ktPplNPpDGVqAAAgQoV0ZiUmJkZZWVkqKysLbPP7/SorK5Pb7W51n4MHD7YIkujoaEmSMXxKBAAAfLuQzqxIUkFBgSZMmKDs7GwNGzZMc+fOVX19vSZOnChJGj9+vNLS0lRcXCxJGjVqlObMmaOhQ4cqJydHn3zyiaZPn65Ro0YFogUAAKAtIcdKXl6e9uzZoxkzZsjr9WrIkCEqLS0NfOi2qqoq6EzKfffdJ4fDofvuu0+ff/65zjrrLI0aNUoPPvjgyVsFAACIWA5zGvwtxufzKSEhQbW1tYqPj2+34wy+f4V8Xx1pt9cHbLZr9shwTwFAhDlZv7/5biAAAGA1YgUAAFiNWAEAAFYjVpqx/sM7AAB0QMQKAACwGrECAACsRqwAAACrESsAAMBqxAoAALAasQIAAKxGrAAAAKsRKwAAwGrECgAAsBqxAgAArEasAAAAqxErAADAasQKAACwGrECAACsRqwAAACrESsAAMBqxAoAALAasQIAAKxGrAAAAKsRKwAAwGrECgAAsBqxAgAArEasNGfCPQEAAHAsYgUAAFiNWAEAAFYjVgAAgNWIFQAAYDViBQAAWI1YAQAAViNWAACA1YgVAABgNWIFAABYjVgBAABWI1aa4W77AADYh1gBAABWI1aacYR7AgAAoAViBQAAWI1YAQAAViNWAACA1YgVAABgNWIFAABYjVgBAABWI1YAAIDViBUAAGA1YqUZbrcPAIB9iBUAAGA1YgUAAFiNWAEAAFYjVgAAgNWIFQAAYDViBQAAWI1YAQAAVjuhWCkpKVFGRoZiY2OVk5Oj9evXf+v4/fv3Kz8/Xz169JDT6dS5556r5cuXn9CEAQBAx9Ip1B2WLFmigoICzZ8/Xzk5OZo7d65yc3O1bds2JScntxjf2NioH//4x0pOTtbLL7+stLQ0ffbZZ0pMTDwZ8wcAABEu5FiZM2eObr75Zk2cOFGSNH/+fC1btkwLFy7UtGnTWoxfuHCh9u3bp3feeUedO3eWJGVkZHy/WQMAgA4jpD8DNTY2qqKiQh6P55sXiIqSx+NReXl5q/u89tprcrvdys/Pl8vl0sCBAzVr1iw1NTW1eZyGhgb5fL6gx6lgDDfcBwDANiHFyt69e9XU1CSXyxW03eVyyev1trrPjh079PLLL6upqUnLly/X9OnT9fDDD+sPf/hDm8cpLi5WQkJC4JGenh7KNAEAQARp96uB/H6/kpOT9cQTTygrK0t5eXm69957NX/+/Db3KSwsVG1tbeCxe/fu9p6mJMnhcJyS4wAAgOMX0mdWkpKSFB0drerq6qDt1dXVSklJaXWfHj16qHPnzoqOjg5sO++88+T1etXY2KiYmJgW+zidTjmdzlCmBgAAIlRIZ1ZiYmKUlZWlsrKywDa/36+ysjK53e5W9xk+fLg++eQT+f3+wLbt27erR48erYYKAABAcyH/GaigoEALFizQs88+q61bt+rWW29VfX194Oqg8ePHq7CwMDD+1ltv1b59+zR58mRt375dy5Yt06xZs5Sfn3/yVgEAACJWyJcu5+
Xlac+ePZoxY4a8Xq+GDBmi0tLSwIduq6qqFBX1TQOlp6drxYoVuuOOOzR48GClpaVp8uTJmjp16slbBQAAiFgOcxp
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"(df.epub_rank_is_greater * 1).plot(style=dict( linestyle='--ob'))"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "genanki",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.8.12"
|
||
|
},
|
||
|
"orig_nbformat": 4,
|
||
|
"vscode": {
|
||
|
"interpreter": {
|
||
|
"hash": "45165aaad12be9d54ac15a8bf37c1f60b988e47dbfa6601f9f69e1d36951ad3e"
|
||
|
}
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|