#!/usr/bin/env python3
#
# Translate text from standard input using machine translation services.
# Copyright (c) 2019-2020, Hiroyuki Ohsaki.
# All rights reserved.
#
# $Id: pytrans,v 1.13 2020/10/01 05:15:39 ohsaki Exp $
#
import json
import os
import re
import subprocess
import sys
import urllib
import urllib.parse
import urllib.request
from perlcompat import die, warn, getopts, require
import htmltree
import tbdump
# FIXME: support automatic proxy configuration
# os.environ['http_proxy'] = 'http://proxy:8080'
# os.environ['https_proxy'] = 'http://proxy:8080'
# Literal substitutions applied to the source text before translation
# (consumed by prefilter); maps search string -> replacement string.
PREFILTER_TBL = {}
# Literal substitutions applied to the translated text (consumed by
# postfilter); maps search string -> replacement string.
POSTFILTER_TBL = {}
# User-Agent header sent with every HTTP request issued by fetch().
USER_AGENT = 'Mozilla/5.0'
# Authentication key passed to the DeepL API by translate_deepl().
# NOTE(review): the value below looks like a masked placeholder -- a real key
# presumably has to be supplied by the user; confirm.
DEEPL_AUTH_KEY = '********-****-****-****-************'
def usage():
    """Hand the command-line synopsis to die().

    NOTE(review): die() comes from perlcompat and presumably prints the
    message and terminates the program -- confirm.
    """
    synopsis = f"""\
usage: {sys.argv[0]} [-d] [file...]
-d debug mode
"""
    die(synopsis)
def load_rcfile():
    """Execute the user's ``.pytransrc`` file, if present, to apply settings.

    The rc file is plain Python.  It is executed in this module's global
    namespace so that a simple assignment in the rc file (for example
    ``DEEPL_AUTH_KEY = '...'`` or ``PREFILTER_TBL = {...}``) replaces the
    module-level default.

    Returns:
        None.  A missing rc file is silently ignored.

    Any exception raised while executing the rc file is reported through
    die() and the program is terminated.
    """
    # Fall back to expanduser() when HOME is unset; otherwise the path would
    # become the bogus 'None/.pytransrc'.
    home = os.getenv('HOME') or os.path.expanduser('~')
    rc_file = f'{home}/.pytransrc'
    try:
        with open(rc_file) as f:
            code = f.read()
    except FileNotFoundError:
        return None
    try:
        # SECURITY NOTE: the rc file is executed verbatim, so it must be a
        # trusted, user-owned file.
        # globals() is passed explicitly: a bare exec() inside a function
        # stores assignments in a temporary locals dict, so settings assigned
        # in the rc file would never reach the module.
        exec(code, globals())
    except Exception:
        # Exception (not a bare except) so that SystemExit and
        # KeyboardInterrupt raised from the rc file are not swallowed.
        die(f"executing '{rc_file}' failed. aborting...")
        # NOTE(review): die() presumably terminates already; this guarantees
        # we never continue with a half-applied configuration.
        sys.exit(1)
    return None
def mask_labels(labels, text):
    """Hide LaTeX fragments in TEXT behind plain placeholders.

    Inline math ($...$), \\ref{...}, \\cite{...} and align/eqnarray
    environments are each replaced with a placeholder of the form (L0001),
    (L0002), ... so that translation engines do not mangle them.  Every
    placeholder is recorded in the dictionary LABELS (placeholder ->
    original fragment), which is modified in place.  The masked text is
    returned.
    """

    def _stash(match):
        # The placeholder number is derived from how many fragments have been
        # recorded so far.
        key = f'(L{len(labels) + 1:04})'
        labels[key] = match.group(0)
        return key

    # (pattern, flags) pairs, applied strictly in this order.
    rules = (
        (r'\$([^\$]+)\$', 0),
        (r'\\ref{[^}]+}', 0),
        (r'\\cite{[^}]+}', 0),
        (r'\n[ \t]*\\begin{(align|eqnarray)\*?}.+?\\end{(align|eqnarray)\*?} *\n',
         re.DOTALL),
    )
    for pattern, flags in rules:
        text = re.sub(pattern, _stash, text, flags=flags)
    return text
def unamsk_labels(labels, text):
    """Replace every placeholder in TEXT with the fragment it stands for.

    LABELS maps placeholder -> original fragment, in the order in which the
    placeholders were created.  A fragment recorded later may itself contain
    an earlier placeholder (e.g. an align environment whose \\ref was masked
    first), so placeholders are restored newest-first; restoring in creation
    order would leave such nested placeholders in the output.

    Returns the text with all placeholders reverted.
    """
    for placeholder, original in reversed(list(labels.items())):
        text = text.replace(placeholder, original)
    return text
def prefilter(labels, text):
    """Prepare the source string TEXT for translation.

    Applies the literal substitutions in PREFILTER_TBL, masks LaTeX
    fragments (recording them in the dictionary LABELS) and finally strips
    every newline.  Returns the prepared string.
    """
    for needle, replacement in PREFILTER_TBL.items():
        text = text.replace(needle, replacement)
    masked = mask_labels(labels, text)
    # Newlines are dropped only after masking, because the environment
    # pattern used for masking relies on them.
    return masked.replace('\n', '')
def postfilter(labels, text):
    """Clean up the translated string TEXT and restore masked fragments.

    Applies the literal substitutions in POSTFILTER_TBL, repairs
    placeholders that were split by the translation engine and then swaps
    every placeholder recorded in LABELS back to its original fragment.
    Returns the resulting string.
    """
    for needle, replacement in POSTFILTER_TBL.items():
        text = text.replace(needle, replacement)
    # workaround for Google translate: rejoin 'L 0001' into 'L0001'
    rejoined = re.sub(r'L +(\d{4})', r'L\1', text)
    return unamsk_labels(labels, rejoined)
def fetch(url, data=None):
    """Retrieve the content at URL using the urllib.request module.

    Args:
        url: address to request.
        data: optional request body as bytes; when given, urllib issues a
            POST instead of a GET.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: (including HTTPError) when the request fails;
            errors are deliberately left to the caller.
    """
    req = urllib.request.Request(url,
                                 data=data,
                                 headers={'User-Agent': USER_AGENT})
    # The context manager closes the connection even if read()/decode() fail.
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8')
# DeepL ----------------------------------------------------------------
def translate_deepl(text):
    """Translate the string TEXT into English via the DeepL API.

    The parameters are sent as a form-encoded POST body rather than in the
    query string, so that long texts do not run into URL length limits and
    the authentication key does not appear in the URL.

    Returns the translated text, or None when the decoded response is empty.
    """
    params = urllib.parse.urlencode({
        'auth_key': DEEPL_AUTH_KEY,
        'text': text,
        'target_lang': 'EN',
    })
    # Passing a body makes fetch() issue a POST; urllib labels it as
    # application/x-www-form-urlencoded by default.
    resp = fetch('https://api.deepl.com/v2/translate', params.encode('utf-8'))
    # parse the output to extract the translated text
    obj = json.loads(resp)
    return obj['translations'][0]['text'] if obj else None
# google ----------------------------------------------------------------
def translate_google(text):
    """Translate the string TEXT (Japanese to English) via Google Translate.

    The mobile web page is fetched and the translation is taken from the
    first <div> matching 'class=t0'.  Returns the translated text, or None
    when no such element is found.
    """
    query = urllib.parse.quote(text)
    # FIXME: fails if the quoted text is lengthy
    page = fetch(
        f'https://translate.google.com/m?hl=ja&sl=ja&tl=en&ie=UTF-8&prev=_m&q={query}'
    )
    # parse the page to locate the element holding the translation
    tree = htmltree.Node()
    tree.parse_string(page)
    hit = tree.find('div', 'class=t0')
    if not hit:
        return None
    return hit.as_string()
# amazon ----------------------------------------------------------------
def translate_amazon(text):
    """Translate the string TEXT (Japanese to English) via Amazon Translate.

    The ``aws`` command-line tool is invoked with an argument list and
    without a shell, so quotes, backticks, ``$`` and backslashes in TEXT are
    passed through verbatim instead of being interpreted by the shell.

    Returns the translated text, or None when the decoded output is empty.

    Raises:
        OSError: if the ``aws`` executable cannot be started.
        json.JSONDecodeError: if the command does not print valid JSON
            (e.g. because it failed).
    """
    # FIXME: text must be less than 5000 bytes
    cmd = [
        'aws', 'translate', 'translate-text',
        '--source-language-code', 'ja',
        '--target-language-code', 'en',
        '--output', 'json',
        '--text', text,
    ]
    # Only stdout is captured; stderr stays attached to the terminal so that
    # diagnostics from aws are shown and cannot corrupt the JSON payload.
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, universal_newlines=True)
    # parse the output to extract the translated text
    obj = json.loads(proc.stdout)
    return obj['TranslatedText'] if obj else None
# FIXME: support language detection and English-to-Japanese translation
def translate(buf, engine='Google'):
    """Translate the string BUF with the service named by ENGINE.

    ENGINE is one of 'DeepL', 'Google' or 'Amazon'.  Returns whatever the
    selected backend returns, or None for an unrecognized engine name.
    """
    if engine == 'DeepL':
        return translate_deepl(buf)
    if engine == 'Google':
        return translate_google(buf)
    if engine == 'Amazon':
        return translate_amazon(buf)
    return None
def main():
    """Translate standard input with every engine and print each result.

    The input is pre-filtered once, sent to DeepL, Google and Amazon in
    turn, and each translation is post-filtered and printed under the
    engine's name.  With -d, the pre-filtered source text is printed at the
    end, prefixed with '% '.
    """
    opt = getopts('d') or usage()
    debug = opt.d
    load_rcfile()
    buf = sys.stdin.read()
    labels = {}
    buf = prefilter(labels, buf)
    for engine in ['DeepL', 'Google', 'Amazon']:
        translated = translate(buf, engine)
        if translated is None:
            # A backend returns None when it cannot find a translation in
            # the response; postfilter() would fail on None, so report the
            # problem and carry on with the remaining engines.
            print(f'{engine}: no translation obtained. skipping...',
                  file=sys.stderr)
            continue
        translated = postfilter(labels, translated)
        print(engine + ':\n' + translated + '\n')
    if debug:
        print('% ' + buf)
# Run the translator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()