#!/usr/bin/env python3
#
# Translate text from standard input using machine translation services.
# Copyright (c) 2019-2020, Hiroyuki Ohsaki.
# All rights reserved.
#
# $Id: pytrans,v 1.13 2020/10/01 05:15:39 ohsaki Exp $
#

import json
import os
import re
import subprocess
import sys
import urllib
import urllib.parse
import urllib.request

from perlcompat import die, warn, getopts, require
import htmltree
import tbdump

# FIXME: support automatic proxy configuration
# os.environ['http_proxy'] = 'http://proxy:8080'
# os.environ['https_proxy'] = 'http://proxy:8080'

# Plain-string substitutions applied before / after translation.  Both
# tables are empty by default and are meant to be filled in by ~/.pytransrc.
PREFILTER_TBL = {}
POSTFILTER_TBL = {}
USER_AGENT = 'Mozilla/5.0'
DEEPL_AUTH_KEY = '********-****-****-****-************'


def usage():
    """Print the command-line synopsis via die()."""
    die(f"""\
usage: {sys.argv[0]} [-d] [file...]
  -d    debug mode
""")


def load_rcfile():
    """Execute ~/.pytransrc, if present, in this module's namespace.

    The file is ordinary Python source and is trusted (it is run with
    exec), so it may rebind module-level settings such as DEEPL_AUTH_KEY,
    PREFILTER_TBL and POSTFILTER_TBL.  Returns None in every case; a
    missing file is silently ignored.
    """
    rc_file = os.path.expanduser('~/.pytransrc')
    try:
        with open(rc_file) as f:
            code = f.read()
    except FileNotFoundError:
        return None
    try:
        # Pass globals() explicitly: a bare exec() inside a function stores
        # assignments in a throwaway local namespace, so settings made by
        # the rc file would never reach the module-level names.
        exec(code, globals())
    except Exception as err:
        print(f'{rc_file}: {err!r}', file=sys.stderr)
        die(f"executing '{rc_file}' failed. aborting...")
        # Fallback in case die() returns instead of terminating.
        sys.exit(1)
    return None


def mask_labels(labels, text):
    """Replace all LaTeX commands (equations, references, and citations)
    with a plain placeholder (e.g., (L0001)) to avoid confusion in
    translation engines.

    Every replaced fragment is recorded in the dictionary LABELS, keyed by
    its placeholder.  Returns the masked text.
    """
    def _register_label(m):
        astr = m.group(0)
        # Placeholders are numbered sequentially from the current table size.
        label = f'(L{len(labels) + 1:04})'
        labels[label] = astr
        return label

    # Inline math is masked first, then \ref and \cite, then display
    # equations (align/eqnarray) together with their surrounding newlines.
    text = re.sub(r'\$([^\$]+)\$', _register_label, text)
    text = re.sub(r'\\ref{[^}]+}', _register_label, text)
    text = re.sub(r'\\cite{[^}]+}', _register_label, text)
    text = re.sub(
        r'\n[ \t]*\\begin{(align|eqnarray)\*?}.+?\\end{(align|eqnarray)\*?} *\n',
        _register_label,
        text,
        flags=re.DOTALL)
    return text


def unamsk_labels(labels, text):
    """Revert all masked LaTeX commands with the original ones.

    LABELS maps each placeholder to the fragment it replaced.  Returns the
    text with every placeholder substituted back.
    """
    for placeholder, original in labels.items():
        text = text.replace(placeholder, original)
    return text


# Correctly spelled alias; the misspelled name is kept for compatibility.
unmask_labels = unamsk_labels


def prefilter(labels, text):
    """Format the original string TEXT before translation.

    Applies the PREFILTER_TBL substitutions, masks LaTeX commands, and
    removes all newlines.  Masked strings are stored in LABELS as
    dictionary.  Returns the filtered text.
    """
    for before, after in PREFILTER_TBL.items():
        text = text.replace(before, after)
    text = mask_labels(labels, text)
    text = text.replace('\n', '')
    return text


def postfilter(labels, text):
    """Format the translated string TEXT.

    Applies the POSTFILTER_TBL substitutions and reverts all masked
    strings recorded in LABELS.  Returns the filtered text.
    """
    for before, after in POSTFILTER_TBL.items():
        text = text.replace(before, after)
    # workaround for Google translate
    # (rejoins 'L' and the four digits when spaces appear between them)
    text = re.sub(r'L +(\d{4})', r'L\1', text)
    text = unmask_labels(labels, text)
    return text


def fetch(url, data=None):
    """Retrieve the content at URL using urllib.request module.

    DATA, if given, is passed to urllib.request.Request as the request
    body.  Returns the response body decoded as UTF-8.  Errors raised by
    urllib are not handled here and propagate to the caller.
    """
    req = urllib.request.Request(url,
                                 data=data,
                                 headers={'User-Agent': USER_AGENT})
    # FIXME: implement error handling
    # The with-block closes the connection even if reading/decoding fails.
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8')


# DeepL ----------------------------------------------------------------
def translate_deepl(text):
    """Translate the string TEXT via DeepL API.

    Returns the translated text, or None if the decoded response is empty.
    """
    quoted = urllib.parse.quote(text)
    # FIXME: fails if the quoted text is lengthy
    url = f'https://api.deepl.com/v2/translate?auth_key={DEEPL_AUTH_KEY}&text={quoted}&target_lang=EN'
    resp = fetch(url)
    # parse the output to extract the translated text
    obj = json.loads(resp)
    return obj['translations'][0]['text'] if obj else None


# google ----------------------------------------------------------------
def translate_google(text):
    """Translate the string TEXT via Google Translate.

    Returns the translated text, or None if the result element is not
    found in the returned page.
    """
    quoted = urllib.parse.quote(text)
    # FIXME: fails if the quoted text is lengthy
    url = f'https://translate.google.com/m?hl=ja&sl=ja&tl=en&ie=UTF-8&prev=_m&q={quoted}'
    html = fetch(url)
    # parse the output to extract the translated text
    root = htmltree.Node()
    root.parse_string(html)
    node = root.find('div', 'class=t0')
    return node.as_string() if node else None


# amazon ----------------------------------------------------------------
def translate_amazon(text):
    """Translate the string TEXT via AWS Amazon Translate service.

    Runs the `aws` command-line tool.  Returns the translated text, or
    None if the decoded output is empty.  Raises OSError if the command
    cannot be started and ValueError (json.JSONDecodeError) if it does not
    print valid JSON.
    """
    # FIXME: text must be less than 5000 bytes
    # The command is passed as an argument list, without a shell, so that
    # quotes, backticks and '$' in TEXT reach the tool verbatim instead of
    # being interpreted by the shell.
    cmd = [
        'aws', 'translate', 'translate-text',
        '--source-language-code', 'ja',
        '--target-language-code', 'en',
        '--output', 'json',
        '--text', text,
    ]
    # Only stdout is captured; diagnostics on stderr pass straight through
    # to the user and cannot corrupt the JSON being parsed.
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, universal_newlines=True)
    # parse the output to extract the translated text
    # FIXME: must check the output validity
    obj = json.loads(proc.stdout)
    return obj['TranslatedText'] if obj else None


# FIXME: support language detection and English-to-Japanese translation
def translate(buf, engine='Google'):
    """Translate the string BUF with the service named ENGINE.

    ENGINE is one of 'DeepL', 'Google' or 'Amazon'.  Returns the
    translated text, or None for an unknown engine or when the service
    yields no result.
    """
    # Built per call so that the functions are looked up by name at call time.
    engines = {
        'DeepL': translate_deepl,
        'Google': translate_google,
        'Amazon': translate_amazon,
    }
    func = engines.get(engine)
    if func is None:
        return None
    return func(buf)


def main():
    """Read text from standard input and print its translation by each
    supported engine."""
    opt = getopts('d') or usage()
    debug = opt.d
    load_rcfile()

    buf = sys.stdin.read()
    labels = {}
    buf = prefilter(labels, buf)
    for engine in ['DeepL', 'Google', 'Amazon']:
        # A failure of one service must not prevent trying the others.
        try:
            translated = translate(buf, engine)
        except (OSError, ValueError, LookupError) as err:
            print(f'{engine}: translation failed: {err}', file=sys.stderr)
            continue
        if translated is None:
            print(f'{engine}: no translation found in the response',
                  file=sys.stderr)
            continue
        translated = postfilter(labels, translated)
        print(engine + ':\n' + translated + '\n')
    if debug:
        # Show the masked, newline-stripped text that was actually submitted.
        print('% ' + buf)


if __name__ == "__main__":
    main()