"""Tools for WFSR.

By: Lennert van Overbeeke
"""

# imports

# custom imports
import wfsr

# basics
import os
import logging
from time import sleep
from datetime import datetime, timedelta

# web
import html
import signal
import unicodedata
import googletrans
from selenium import webdriver

# data
import json
import hashlib
import openpyxl
import pandas as pd
from multiprocessing import Pool
from xlrd import XLRDError
from pkg_resources import resource_filename
from openpyxl.utils import get_column_letter

# geo
import pycountry
import shapefile
from shapely.geometry import Point  # Point class
from shapely.geometry import shape  # shape() converts geo objects through the interface
# shapely is installed by pip from a .whl file
# make sure to include it in the package data


# Time

def parallel(function, iterable, threads=5):
    """Map ``function`` over ``iterable`` using a pool of worker processes.

    Arguments:
        function = Callable applied to every item of ``iterable``.
        iterable = Items passed to ``function`` one at a time.
        threads  = Int: size of the ``multiprocessing.Pool`` (default 5).
                   Despite the name, the workers are processes.

    Returns the list of results in input order (``Pool.map``).
    """
    assert isinstance(threads, int)
    with Pool(threads) as p:
        return p.map(function, iterable)


class Scheduler():
    """Repeatedly run a function after a fixed delay.

    The delay is ``hours`` + ``minutes`` from the moment a waiting period
    starts, truncated to the whole minute.
    """

    def __init__(self, hours=1, minutes=0):
        self.hours = hours
        self.minutes = minutes

    def get_target(self):
        """Return the datetime (seconds zeroed) at which the next run is due."""
        # timedelta arithmetic carries minutes into hours and hours into days.
        # The earlier per-field modulo arithmetic did not, so e.g. 23:30 + 1h
        # produced 00:30 on the *same* day (a target in the past).
        target = datetime.now() + timedelta(hours=self.hours, minutes=self.minutes)
        return target.replace(second=0, microsecond=0)

    def wait(self):
        """Block until the target time computed at call time has passed."""
        target = self.get_target()
        while target > datetime.now():
            sleep(1)

    def start(self, function, *args):
        """Loop forever: wait for the next target, then call ``function(*args)``."""
        while True:
            self.wait()
            function(*args)


# Web scraping tools

def get_phantomjs_driver():
    """Use PhantomJS to load JavaScript-generated pages."""
    linux_driver = resource_filename('wfsr', 'datafiles/phantomjs')
    windows_driver = resource_filename('wfsr', 'datafiles/phantomjs.exe')
    try:
        driver = webdriver.PhantomJS(linux_driver)
    except OSError:
        # the Linux binary could not be started; fall back to the Windows one
        driver = webdriver.PhantomJS(windows_driver)
    return driver


def get_chrome_driver():
    """WORK IN PROGRESS

    Return a headless Chrome driver with a 10 second implicit wait.
    """
    linux_driver = resource_filename('wfsr', 'datafiles/chromedriver')
    windows_driver = resource_filename('wfsr', 'datafiles/chromedriver.exe')
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    try:
        driver = webdriver.Chrome(linux_driver, chrome_options=chrome_options)
    except OSError:
        driver = webdriver.Chrome(windows_driver, chrome_options=chrome_options)
    driver.implicitly_wait(10)

    return driver


def get_html(url):
    """Fetch page HTML for a given url"""
    driver = get_phantomjs_driver()
    try:
        # fetch the website contents, which takes a few seconds
        driver.get(url)
        # get pure HTML
        return driver.page_source
    finally:
        # Always tear the browser down, also when loading the page failed,
        # so no orphaned phantomjs process is left behind.
        driver.service.process.send_signal(signal.SIGTERM)  # kill the specific phantomjs child proc
        driver.quit()


def get_md5(s):
    """Return the hexadecimal MD5 digest of string ``s`` (default encoding)."""
    return hashlib.md5(s.encode()).hexdigest()


# normalize text

def normalize_text(text):
    """Return ``text`` NFKC-normalized with HTML entities unescaped.

    Unescaping is applied three times so that nested escapes such as
    ``&amp;amp;lt;`` are resolved as well.
    """
    # Compose modified chars into one char
    text = unicodedata.normalize('NFKC', text)
    # Translate HTML escaped chars back to chars; repeat to fix nested html chars
    for _ in range(3):
        text = html.unescape(text)
    return text


# reference data

_countries = wfsr.data.load('countries')
_countries['country_caseless'] = _countries['country'].apply(lambda x: normalize_text(x).casefold())

# Lookup key of the placeholder row that stands for "unknown country".
_UNKNOWN_COUNTRY = '99'

_shape_file = resource_filename('wfsr', 'datafiles/ne_50m_admin_0_countries')
_shape_reader = shapefile.Reader(_shape_file)
_shape_keys = [x[0] for x in _shape_reader.fields]

# (iso, record dict, shapely geometry) for every shape that has an ISO_A3 code
_shape_countries = []
for country in _shape_reader.shapeRecords():
    rec = {k: v for k, v in zip(_shape_keys, country.record)}
    shp = shape(country.shape)
    iso = rec['ISO_A3']
    if iso != '-99':
        _shape_countries.append((iso, rec, shp))


def find_country_from_coordinates(lat, lon):
    """Return the country row whose shape contains the point (lat, lon).

    Arguments:
        lat, lon = Numbers or numeric strings.

    Returns the result of ``find_country`` for the matching ISO code, or the
    unknown-country row ('99') when the coordinates are not numeric or no
    shape contains the point.
    """
    try:
        latitude = float(lat)
        longitude = float(lon)
    except (TypeError, ValueError):
        # float() raises ValueError for non-numeric strings and TypeError for
        # e.g. None; neither can be turned into a Point.
        print(f"""Please make sure lat and lon are numeric.
        lat: {str(lat)}
        lon: {str(lon)}""")
        return find_country(_UNKNOWN_COUNTRY)
    p = Point(longitude, latitude)  # the order matters
    for iso, rec, shp in _shape_countries:
        if shp.contains(p):
            return find_country(iso)
    return find_country(_UNKNOWN_COUNTRY)


def test_find_country_from_coordinates():
    assert find_country_from_coordinates(52, 5).alpha3 == 'NLD'


# Country finder

def find_country(string, verbose=True):
    """Return the row of the countries table that matches ``string``.

    The input is normalized and casefolded, resolved through ``pycountry``
    when possible, and then compared against every value of every row of the
    countries table and against the row's ``synonyms`` field.

    Arguments:
        string  = Anything convertible to str (name, code, ...).
        verbose = Boolean: print inputs that could not be matched.

    Returns the matching row, or the unknown-country row ('99').
    Raises LookupError when the unknown-country row itself is missing.
    """
    try:
        # make sure input is a caseless, normalized string
        string = normalize_text(str(string).strip()).casefold()
    except AttributeError:
        # if this fails, return the country equivalent of None
        return find_country(_UNKNOWN_COUNTRY)
    # Make sure there is enough data for a positive identification
    if len(string) < 2:
        return find_country(_UNKNOWN_COUNTRY)

    try:
        # use a library to catch the most common cases
        hit = pycountry.countries.lookup(string)
        string = hit.alpha_2.casefold()
    except LookupError:
        pass

    for idx, row in _countries.iterrows():
        for value in row.values:
            value = normalize_text(str(value).strip()).casefold()
            if string == value:
                return row
        # NOTE(review): membership test; substring match if synonyms is a str
        if string in row['synonyms']:
            return row

    if string == _UNKNOWN_COUNTRY:
        # Without this guard a missing placeholder row would recurse forever.
        raise LookupError("countries reference data has no '99' (unknown) row")

    # print failures unless verbose=False
    if verbose:
        print(string)
    return find_country(_UNKNOWN_COUNTRY)


def bulk_find_country(l, field='alpha2', verbose=False):
    """Translate a long list/pd.Series of country names.
    All unique values are converted once to a dictionary,
    then the list/pd.Series is translated using the
    dictionary and returned.

    Arguments:
        l       = list or pd.Series of country names/codes.
        field   = Column of the countries table to return per item.
        verbose = Passed on to ``find_country``.

    Returns a list for list input, or a pd.Series that keeps the input index.
    """
    assert isinstance(l, (list, pd.Series))
    assert field in _countries.columns
    unique = set(l) if isinstance(l, list) else l.unique()
    rows = {u: find_country(u, verbose=verbose) for u in unique}

    values = []
    for item in l:
        row = rows.get(item)
        if row is None:
            # NaN never compares equal to itself, so it can miss the dict;
            # look it up directly instead of failing on getattr(None, ...).
            row = find_country(item, verbose=verbose)
        values.append(getattr(row, field))

    if isinstance(l, pd.Series):
        return pd.Series(values, index=l.index)
    return values


# Usage examples:
# find_country("DEMOCRATIC REPUBLIC OF THE CONGO")
# find_country('China, Hong Kong SAR')
# import pandas as pd
# s = pd.Series(['NLD', 'BEL'], index=[1,2])
# bulk_find_country(s, field='alpha2')


# logging

def get_logger(name):
    """Return a logger object. Use logger.info() to log lines

    The logger writes to ``<current working directory>/<name>.log`` at DEBUG
    level. Calling this again for the same name and directory reuses the
    existing file handler instead of adding a duplicate.
    """
    logging.basicConfig(level=logging.DEBUG)
    # os.path.join keeps the path valid on both Windows and Linux
    logfile = os.path.join(os.getcwd(), name + '.log')
    print('Logging to file {}'.format(logfile))
    logger = logging.getLogger(name)

    target = os.path.abspath(logfile)  # FileHandler stores the absolute path
    already_attached = any(
        isinstance(h, logging.FileHandler) and h.baseFilename == target
        for h in logger.handlers
    )
    if not already_attached:
        # create a file handler
        handler = logging.FileHandler(logfile)
        handler.setLevel(logging.DEBUG)
        # create a logging format and assign
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)-5s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger


class Excel:
    """Return an Excel file with filter and sort enabled.
    Logs edits to an info sheet when it is updated
    and helps add sheets without overwriting."""

    def __init__(self, filePath, info=None, verbose=True):
        """Open (or create) the workbook at ``filePath``.

        ``info`` is accepted for backward compatibility and not used.
        ``verbose`` makes ``write`` print a confirmation.
        """
        self.f = filePath
        self.verbose = verbose
        self.get_book()

    def get_book(self):
        """Return existing or create new at filePath location."""
        try:
            book = openpyxl.load_workbook(self.f)
            msg = 'File loaded.'
        except FileNotFoundError:
            book = openpyxl.Workbook()
            msg = 'File created.'
            book.worksheets[0].title = 'info'
            book.save(self.f)
        self.book = book
        info = self.get_sheet('info')
        info.sheet_properties.tabColor = "1072BA"
        # every load/create is recorded in the info sheet (this also saves)
        self.log('info', msg)

    def get_sheet(self, sheet_name):
        """Return existing or create new sheet in the workbook."""
        try:
            return self.book[sheet_name]
        except KeyError:
            return self.book.create_sheet(sheet_name)

    def image_to_sheet(self, image, sheet_name="Image", anchor='A1'):
        """Insert ``image`` into ``sheet_name`` at cell ``anchor``, log it and save."""
        ws = self.get_sheet(sheet_name)
        img = openpyxl.drawing.image.Image(image)
        ws.add_image(img, anchor=anchor)
        self.log('info', f'Image added to to sheet {sheet_name}')
        self.save()

    def log(self, sheet_name, *args):
        """Append row with any number of columns at the end of a sheet.
        First column is the timestamp of addition."""
        row = [datetime.now().strftime('%Y-%m-%d %H:%M')]
        row.extend(args)
        sheet = self.get_sheet(sheet_name)
        sheet.append(row)
        self.save()

    def overwrite_sheet(self, sheet_name, df):
        """Write pandas DataFrame to sheet. Overwrite if existing."""
        # NOTE(review): relies on assigning writer.book and on writer.save(),
        # which only older pandas releases support - confirm the pinned version.
        with pd.ExcelWriter(self.f, engine='openpyxl') as writer:
            writer.book = openpyxl.load_workbook(self.f)
            if sheet_name in writer.book.sheetnames:
                del writer.book[sheet_name]
            df.to_excel(writer, sheet_name)
            self.book = writer.book
            writer.save()

    def write(self, df, sheet_name="Data", msg="", filters=True):
        """Write a df to a named sheet (Data) in the workbook.

        Arguments:
            df         = pandas DataFrame to store.
            sheet_name = Target sheet; replaced when it already exists.
            msg        = Extra text stored with the entry in the info sheet.
            filters    = Boolean: enable auto-filter on the full data range
                         and freeze the header row.
        """
        info = 'Writing data to sheet {}.'.format(sheet_name)
        self.log('info', info, msg)
        self.overwrite_sheet(sheet_name, df)
        if filters:
            sheet = self.get_sheet(sheet_name)
            full_sheet = "A1:" + get_column_letter(sheet.max_column) + str(sheet.max_row)
            self.book[sheet_name].auto_filter.ref = full_sheet
            sheet.freeze_panes = 'A2'
        self.save()
        self.close()
        if self.verbose:
            print('{}\nFile: {}'.format(info, self.f))

    def save(self):
        """Save the workbook, retrying every 3 seconds while the file is locked."""
        while True:
            try:
                self.book.save(self.f)
                break
            except PermissionError:
                print(f"""No permission to write to file: {os.getcwd()}/{self.f}.
                Please close the file in order to save.
                If the error persists, you don't have permission to write.""")
                sleep(3)

    def close(self):
        """Close the underlying openpyxl workbook."""
        self.book.close()

    def to_df(self, sheet_name):
        """Return a sheet as pandas DataFrame

        Returns an empty DataFrame when the sheet does not exist.
        """
        try:
            # 'encoding' is not a read_excel parameter (cell values are
            # already str) and newer pandas rejects it, so it is not passed.
            df = pd.read_excel(self.f, sheet_name=sheet_name, engine='openpyxl')
        except (XLRDError, KeyError, ValueError):
            # The openpyxl engine signals a missing sheet with KeyError or
            # ValueError (depending on the pandas version), never XLRDError.
            df = pd.DataFrame()
        return df

    def to_dict(self):
        """Return the entire workbook as a dict of pandas DataFrames"""
        d = pd.read_excel(self.f, sheet_name=None, engine='openpyxl')
        return d


# quick translate

def translate(query, to_lang='en', file_path=None, meta=False, verbose=True):
    """Get translations to English, from any language,
    from the Google Translate API.

    Arguments:
        query     = String or list of strings
        to_lang   = Sets the language to translate to.
                    Default: 'en' for English.
        file_path = String: Excel file used to store (intermediate) results
                    and to reuse earlier translations.
                    Default: None, nothing is read or stored.
        meta      = Boolean: False (default) returns text only.
                    True returns a JSON string with the translation metadata.
        verbose   = Boolean: report when stored translations are reused.

    Returns the translation for a single query string, otherwise a dict that
    maps every query (and every stored query) to its translation.
    """

    def validate_input(query):
        """Transform input to list of non-empty strings."""
        assert isinstance(query, (str, list))
        if isinstance(query, list):
            query = [str(q) for q in query if len(str(q)) > 0]
        else:
            query = [query]
        return query

    def get_translation(string):
        """Query the Google API and return a string of the translation or,
        if meta=True, a JSON string of the translation object.
        """
        try:
            response = googletrans.Translator().translate(string, dest=to_lang)
            if meta:
                # The response object itself is not JSON serializable;
                # dump its documented attributes instead.
                fields = ('src', 'dest', 'origin', 'text', 'pronunciation')
                return json.dumps({k: getattr(response, k, None) for k in fields})
            else:
                return response.text
        except Exception as e:
            return exception_handler(string, e)

    def exception_handler(string, e):
        """Retry, give up gracefully, or re-raise depending on the error text."""
        # JSONDecodeError carries the offending server reply in its 'doc' attribute
        doc = str(getattr(e, 'doc', '') or '')
        if 'request that was too large' in doc:
            # Trim to 3200 characters or shorter if necessary
            new_len = min(len(string) - 100, 3200)
            if new_len <= 0:
                # nothing left to trim; retrying would recurse forever
                raise e
            print(f"Request too large. Trimming query string to {new_len} characters.")
            string = string[:new_len]
            return get_translation(string)
        elif 'unusual traffic' in doc:
            print(f"Google API: daily limit reached. Try again in 24h.")
            return f"<untranslated: {string}>"
        else:
            raise e

    def read_result(file_path):
        """Return stored {query: result} pairs from the "Data" sheet."""
        excel = Excel(file_path)
        df = excel.to_df("Data")
        if 'query' not in df.columns or 'result' not in df.columns:
            # missing or empty sheet: nothing stored yet
            return {}
        d = df.set_index('query').to_dict()['result']
        return d

    def store_result(result, newly_translated):
        """Write all known translations to the "Data" sheet of ``file_path``."""
        excel = Excel(file_path, verbose=False)
        df = pd.DataFrame(data={
            'query': list(result.keys()),
            'result': list(result.values()),
        })
        msg = f"New queries: {newly_translated}."
        excel.write(df, msg=msg, sheet_name="Data")

    def main(query):
        """Run translation and handle exceptions."""
        newly_translated = 0
        query_list = validate_input(query)
        if file_path and os.path.isfile(file_path):
            if verbose:
                print(f"Using translations from file: {file_path}.")
            # Check previous translations using this file
            result = read_result(file_path)
        else:
            result = {}
        # try/finally makes sure store_result() is always triggered, in order
        # to store the progress; an exception propagates after the 'finally'.
        try:
            for string in query_list:
                if string not in result:
                    result[string] = get_translation(string)
                    newly_translated += 1
        finally:
            if file_path:
                store_result(result, newly_translated)
        if len(query_list) == 1:
            return result.get(query_list[0], f"<untranslated: {query_list[0]}>")
        else:
            return result

    return main(query)


def test_translate():
    query = ["Nederlandse", "Kaas", "Tarwe", "Huis"]
    expected_translation = ["Dutch", "Cheese", "Wheat", "House"]
    try:
        expected_result = dict(zip(query, expected_translation))
        true_result = translate(query)
        assert expected_result == true_result
    except AssertionError:
        print(f"Query:\n {query}")
        print(f"Expected translations:\n {expected_translation}")
        print(f"Expected translation() result:\n {expected_result}")
        print(f"Actual translation() result:\n {true_result}")

# test_translate()