diff --git a/wfsr/.ipynb_checkpoints/data-checkpoint.ipynb b/wfsr/.ipynb_checkpoints/data-checkpoint.ipynb deleted file mode 100644 index fe90c7e354ac30b1647ceee2ff03b40c28a0c934..0000000000000000000000000000000000000000 --- a/wfsr/.ipynb_checkpoints/data-checkpoint.ipynb +++ /dev/null @@ -1,123 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# imports\n", - "\n", - "import pandas as pd\n", - "from xlrd import XLRDError\n", - "from pkg_resources import resource_filename" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "importing Jupyter notebook from C:\\Anaconda3\\envs\\py36\\lib\\site-packages\\rikilt\\data.ipynb\n", - "importing Jupyter notebook from C:\\Anaconda3\\envs\\py36\\lib\\site-packages\\rikilt\\tools.ipynb\n", - "importing Jupyter notebook from C:\\Anaconda3\\envs\\py36\\lib\\site-packages\\rikilt\\elastic.ipynb\n" - ] - } - ], - "source": [ - "# data\n", - "\n", - "data_file = resource_filename('wfsr', 'datafiles/data.xlsx')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# The country table requires some modification\n", - "\n", - "_countries_converters = {\n", - " 'alpha2': str,\n", - " 'eurostat_alpha2': str,\n", - " 'alpha3': str,\n", - " 'numeric': str,\n", - "}\n", - "\n", - "def _load_countries():\n", - " df = pd.read_excel(\n", - " data_file, \n", - " sheet_name='countries', \n", - " converters=_countries_converters,\n", - " keep_default_na=False) \\\n", - " .fillna('')\n", - " df['synonyms'] = df['synonyms'].str.split('|')\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Requesting a table returns a newly made copy each time\n", - "\n", - "def load(table_name=''):\n", - " if table_name == 'feed_ontology':\n", - " # Return the feed master table from another package\n", - " import feed_conversion\n", - " return feed_conversion.tools.data\n", - " elif table_name == 'countries':\n", - " # Requires some pretreatment\n", - " return _load_countries()\n", - " try:\n", - " return pd.read_excel(\n", - " data_file, \n", - " sheet_name=table_name, \n", - " keep_default_na=False\n", - " ).fillna('')\n", - " except XLRDError:\n", - " # If a non-existent table was requested, return a list of options.\n", - " table_dict = pd.read_excel(data_file, sheet_name=None)\n", - " additional = list(table_dict.keys()) + ['feed_ontology']\n", - " index = sorted(additional)\n", - " print_index = \"\\n\".join(index)\n", - " print(f\"Table '{table_name}' not available. Try:\\n{print_index}\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py36", - "language": "python", - "name": "py36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/wfsr/.ipynb_checkpoints/tools-checkpoint.ipynb b/wfsr/.ipynb_checkpoints/tools-checkpoint.ipynb index 65deae868c3900a124b5bc79428dfbd159bc1a60..113f402eeba91bf40e0b257a1591c408267731e3 100644 --- a/wfsr/.ipynb_checkpoints/tools-checkpoint.ipynb +++ b/wfsr/.ipynb_checkpoints/tools-checkpoint.ipynb @@ -181,127 +181,6 @@ " return text\n" ] }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# reference data\n", - "\n", - "_countries = wfsr.data.load('countries')\n", - "_countries['country_caseless'] = _countries['country'].apply(lambda x: normalize_text(x).casefold() )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "_shape_file = resource_filename('wfsr', 'datafiles/ne_50m_admin_0_countries')\n", - "_shape_reader = shapefile.Reader(_shape_file)\n", - "_shape_keys = [ x[0] for x in _shape_reader.fields ]\n", - "\n", - "_shape_countries = []\n", - "for country in _shape_reader.shapeRecords():\n", - " rec = {k: v for k, v in zip(_shape_keys, country.record)}\n", - " shp = shape(country.shape)\n", - " iso = rec['ISO_A3']\n", - " if iso != '-99':\n", - " _shape_countries.append((iso, rec, shp))\n", - "\n", - "def find_country_from_coordinates(lat, lon):\n", - " try:\n", - " latitude = float(lat)\n", - " longitude = float(lon)\n", - " except TypeError:\n", - " print(\"\"\"Please make sure lat and lon are numeric.\n", - " lat: {str(lat)}\n", - " lon: {str(lon)}\"\"\")\n", - " p = Point(longitude, latitude) # the order matters\n", - " for iso, rec, shp in _shape_countries:\n", - " if shp.contains(p):\n", - " return find_country(iso)\n", - " return find_country('99')\n", - "\n", - "def test_find_country_from_coordinates():\n", - " assert find_country_from_coordinates(52, 5).alpha3 == 'NLD'\n" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "# Country finder \n", - "\n", - "def find_country(string, verbose=True):\n", - " try:\n", - " # make sure input is a string, all caps\n", - " string = normalize_text(str(string).strip()).casefold()\n", - " # Make sure there is enough data for a positive identification\n", - " assert len(string) > 1\n", - " except (AttributeError, AssertionError) as e:\n", - " # if this fails, return the country equivalent of None\n", - " return find_country('99')\n", - "\n", - " try:\n", - " # use a library to catch the most common cases\n", - " hit = pycountry.countries.lookup(string)\n", - " string = hit.alpha_2.casefold()\n", - " except LookupError:\n", - " pass\n", - "\n", - " for idx, row in _countries.iterrows():\n", - " for value in row.values:\n", - " value = normalize_text(str(value).strip()).casefold()\n", - " if string == value:\n", - " return row\n", - " if string in row['synonyms']:\n", - " return row\n", - "\n", - " # print failures unless verbose=False\n", - " if verbose:\n", - " print(string)\n", - " return find_country('99')\n", - "\n", - "def bulk_find_country(l, field='alpha2', verbose=False):\n", - " \"\"\"Translate a long list/pd.Series of country names.\n", - " All unique values are converted once to a dictionary, \n", - " then the list/pd.Series is translated using the \n", - " dictionary and returned.\n", - " \"\"\"\n", - " assert isinstance(l, (list, pd.Series))\n", - " assert field in _countries.columns\n", - " if isinstance(l, list):\n", - " unique = set(l)\n", - " conversion_dict = { u: find_country(u, verbose=verbose) for u in unique }\n", - " converted = [ conversion_dict.get(country) for country in l ]\n", - " get_field = [ getattr(country, field) for country in converted ]\n", - " return get_field\n", - " elif isinstance(l, pd.Series):\n", - " unique = l.unique()\n", - " conversion_dict = { u: find_country(u, verbose=verbose) for u in unique }\n", - " converted = [ conversion_dict.get(country) for country in l ]\n", - " get_field = [ getattr(country, field) for country in converted ]\n", - " return pd.Series(get_field, index=l.index)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "# find_country(\"DEMOCRATIC REPUBLIC OF THE CONGO\")\n", - "# find_country('China, Hong Kong SAR')\n", - "# import pandas as pd\n", - "# s = pd.Series(['NLD', 'BEL'], index=[1,2])\n", - "# bulk_find_country(s, field='alpha2')" - ] - }, { "cell_type": "code", "execution_count": null, @@ -469,7 +348,7 @@ " if meta=True, a JSON string of the translation object.\n", " \"\"\"\n", " try:\n", - " response = googletrans.Translator().translate(string, dest=to_lang)\n", + " response = googletrans.Translator(service_urls=['translate.googleapis.com']).translate(string, dest=to_lang)\n", " if meta:\n", " return json.dumps(response)\n", " else:\n", @@ -582,7 +461,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/wfsr/__init__.py b/wfsr/__init__.py index 3a47dffffb5532794be02a059d2649562a8b785d..4c2918b10147959fbb65ba4dbbc1b8aba9a42a0f 100644 --- a/wfsr/__init__.py +++ b/wfsr/__init__.py @@ -1,3 +1,3 @@ import import_ipynb -from . import data, tools, elastic -#from . import tools, elastic +#from . import data, tools, elastic +from . import tools, elastic diff --git a/wfsr/data.ipynb b/wfsr/data.ipynb deleted file mode 100644 index 83d06f878fa3ff9706bc4eed0ebffc6e0b791b85..0000000000000000000000000000000000000000 --- a/wfsr/data.ipynb +++ /dev/null @@ -1,126 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# imports\n", - "\n", - "import pandas as pd\n", - "import openpyxl\n", - "from xlrd import XLRDError\n", - "from pkg_resources import resource_filename" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "importing Jupyter notebook from C:\\Anaconda3\\envs\\py36\\lib\\site-packages\\rikilt\\data.ipynb\n", - "importing Jupyter notebook from C:\\Anaconda3\\envs\\py36\\lib\\site-packages\\rikilt\\tools.ipynb\n", - "importing Jupyter notebook from C:\\Anaconda3\\envs\\py36\\lib\\site-packages\\rikilt\\elastic.ipynb\n" - ] - } - ], - "source": [ - "# data\n", - "\n", - "data_file = resource_filename('wfsr', 'datafiles/data.xlsx')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# The country table requires some modification\n", - "\n", - "_countries_converters = {\n", - " 'alpha2': str,\n", - " 'eurostat_alpha2': str,\n", - " 'alpha3': str,\n", - " 'numeric': str,\n", - "}\n", - "\n", - "def _load_countries():\n", - " df = pd.read_excel(\n", - " data_file, \n", - " sheet_name='countries', \n", - " converters=_countries_converters,\n", - " engine='openpyxl',\n", - " keep_default_na=False) \\\n", - " .fillna('')\n", - " df['synonyms'] = df['synonyms'].str.split('|')\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Requesting a table returns a newly made copy each time\n", - "\n", - "def load(table_name=''):\n", - " if table_name == 'feed_ontology':\n", - " # Return the feed master table from another package\n", - " import feed_conversion\n", - " return feed_conversion.tools.data\n", - " elif table_name == 'countries':\n", - " # Requires some pretreatment\n", - " return _load_countries()\n", - " try:\n", - " return pd.read_excel(\n", - " data_file, \n", - " sheet_name=table_name,\n", - " engine='openpyxl',\n", - " keep_default_na=False\n", - " ).fillna('')\n", - " except XLRDError:\n", - " # If a non-existent table was requested, return a list of options.\n", - " table_dict = pd.read_excel(data_file, sheet_name=None, engine='openpyxl')\n", - " additional = list(table_dict.keys()) + ['feed_ontology']\n", - " index = sorted(additional)\n", - " print_index = \"\\n\".join(index)\n", - " print(f\"Table '{table_name}' not available. Try:\\n{print_index}\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/wfsr/feed.ipynb b/wfsr/feed.ipynb deleted file mode 100644 index 9e7dce482b1f9cb9a8e54a8aec92d8eedcced54e..0000000000000000000000000000000000000000 --- a/wfsr/feed.ipynb +++ /dev/null @@ -1,163 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"Conversion methods for feed conversion app.\n", - "\n", - "By: Lennert van Overbeeke\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# imports\n", - "\n", - "# custom imports\n", - "import wfsr\n", - "\n", - "# common imports\n", - "import os\n", - "import pandas as pd\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# objects\n", - "\n", - "df = wfsr.data.load('feed_ontology')\n", - "riskfeed_groups = wfsr.data.load('conversion_old')\n", - "\n", - "# derived values\n", - "feed_columns = sorted(list(df.columns), key=lambda x: x.lower())\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# foodex methods\n", - "\n", - "def foodex_parent(child_code):\n", - " \"\"\"Return parent FOODEX2 code from the table, or None.\"\"\"\n", - " if child_code in df['FOODEX2_code'].values:\n", - " results = df[df['FOODEX2_code'] == child_code]\n", - " parent_code = results['FOODEX2_parent'].values[0]\n", - " else:\n", - " parent_code = 'None'\n", - " return parent_code\n", - " \n", - " \n", - "def foodex_children(parent_code):\n", - " \"\"\"Return a list of FOODEX2 items that have parent_code as parent.\"\"\"\n", - " results = df[df['FOODEX2_parent'] == parent_code]\n", - " return list(results['FOODEX2_code'])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# column translation\n", - "\n", - "def get_column_names():\n", - " return sorted(list(df.columns), key=lambda x: x.lower())\n", - "\n", - "def translate_columns(query, A, B):\n", - " \"\"\"For each line in q that matches a value in column A, \n", - " return the corresponding value from column B.\"\"\"\n", - " assert isinstance(query, str)\n", - " assert set([A, B]).issubset(df.columns)\n", - " query = query.split('\\n')\n", - " rows = [ row.strip() for row in query ]\n", - " results = []\n", - " search_space = df[[A,B]].astype(str)\n", - " for row in rows:\n", - " result = ''\n", - " if row:\n", - " result = search_space[search_space[A] == row]\n", - " result = [ v for v in result[B].values if v ]\n", - " result = \"|\".join(result)\n", - " results.append((row, result, ))\n", - " return results\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# RiskFeed tools\n", - "\n", - "def trendgroup_to_crop(query):\n", - " \"\"\"Returns the crop plant type for a trend group.\"\"\"\n", - " result = riskfeed_groups[riskfeed_groups.trend_group.str.lower() \\\n", - " == query.lower()]\n", - " if len(result):\n", - " return result.iloc[0].crop\n", - " else:\n", - " return 'NA'\n", - "\n", - " \n", - "def get_riskfeed_id(query):\n", - " \"\"\"Returns the index number of the RiskFeed ingredient \n", - " based on case insensitive full string match.\"\"\"\n", - " try:\n", - " return riskfeed_groups[riskfeed_groups['NameEN'].str.lower() \\\n", - " == query.lower()].iloc[0].ID\n", - " except IndexError:\n", - " try: \n", - " return riskfeed_groups[riskfeed_groups['NameNL'].str.lower() \\\n", - " == query.lower()].iloc[0].ID\n", - " except IndexError:\n", - " return pd.np.nan\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py36", - "language": "python", - "name": "py36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/wfsr/tools.ipynb b/wfsr/tools.ipynb index 86548ddfdf5cfa59f0ba6a42ea90d650c1db731a..113f402eeba91bf40e0b257a1591c408267731e3 100644 --- a/wfsr/tools.ipynb +++ b/wfsr/tools.ipynb @@ -181,127 +181,6 @@ " return text\n" ] }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# reference data\n", - "\n", - "_countries = wfsr.data.load('countries')\n", - "_countries['country_caseless'] = _countries['country'].apply(lambda x: normalize_text(x).casefold() )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "_shape_file = resource_filename('wfsr', 'datafiles/ne_50m_admin_0_countries')\n", - "_shape_reader = shapefile.Reader(_shape_file)\n", - "_shape_keys = [ x[0] for x in _shape_reader.fields ]\n", - "\n", - "_shape_countries = []\n", - "for country in _shape_reader.shapeRecords():\n", - " rec = {k: v for k, v in zip(_shape_keys, country.record)}\n", - " shp = shape(country.shape)\n", - " iso = rec['ISO_A3']\n", - " if iso != '-99':\n", - " _shape_countries.append((iso, rec, shp))\n", - "\n", - "def find_country_from_coordinates(lat, lon):\n", - " try:\n", - " latitude = float(lat)\n", - " longitude = float(lon)\n", - " except TypeError:\n", - " print(\"\"\"Please make sure lat and lon are numeric.\n", - " lat: {str(lat)}\n", - " lon: {str(lon)}\"\"\")\n", - " p = Point(longitude, latitude) # the order matters\n", - " for iso, rec, shp in _shape_countries:\n", - " if shp.contains(p):\n", - " return find_country(iso)\n", - " return find_country('99')\n", - "\n", - "def test_find_country_from_coordinates():\n", - " assert find_country_from_coordinates(52, 5).alpha3 == 'NLD'\n" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "# Country finder \n", - "\n", - "def find_country(string, verbose=True):\n", - " try:\n", - " # make sure input is a string, all caps\n", - " string = normalize_text(str(string).strip()).casefold()\n", - " # Make sure there is enough data for a positive identification\n", - " assert len(string) > 1\n", - " except (AttributeError, AssertionError) as e:\n", - " # if this fails, return the country equivalent of None\n", - " return find_country('99')\n", - "\n", - " try:\n", - " # use a library to catch the most common cases\n", - " hit = pycountry.countries.lookup(string)\n", - " string = hit.alpha_2.casefold()\n", - " except LookupError:\n", - " pass\n", - "\n", - " for idx, row in _countries.iterrows():\n", - " for value in row.values:\n", - " value = normalize_text(str(value).strip()).casefold()\n", - " if string == value:\n", - " return row\n", - " if string in row['synonyms']:\n", - " return row\n", - "\n", - " # print failures unless verbose=False\n", - " if verbose:\n", - " print(string)\n", - " return find_country('99')\n", - "\n", - "def bulk_find_country(l, field='alpha2', verbose=False):\n", - " \"\"\"Translate a long list/pd.Series of country names.\n", - " All unique values are converted once to a dictionary, \n", - " then the list/pd.Series is translated using the \n", - " dictionary and returned.\n", - " \"\"\"\n", - " assert isinstance(l, (list, pd.Series))\n", - " assert field in _countries.columns\n", - " if isinstance(l, list):\n", - " unique = set(l)\n", - " conversion_dict = { u: find_country(u, verbose=verbose) for u in unique }\n", - " converted = [ conversion_dict.get(country) for country in l ]\n", - " get_field = [ getattr(country, field) for country in converted ]\n", - " return get_field\n", - " elif isinstance(l, pd.Series):\n", - " unique = l.unique()\n", - " conversion_dict = { u: find_country(u, verbose=verbose) for u in unique }\n", - " converted = [ conversion_dict.get(country) for country in l ]\n", - " get_field = [ getattr(country, field) for country in converted ]\n", - " return pd.Series(get_field, index=l.index)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "# find_country(\"DEMOCRATIC REPUBLIC OF THE CONGO\")\n", - "# find_country('China, Hong Kong SAR')\n", - "# import pandas as pd\n", - "# s = pd.Series(['NLD', 'BEL'], index=[1,2])\n", - "# bulk_find_country(s, field='alpha2')" - ] - }, { "cell_type": "code", "execution_count": null,