#!/usr/bin/env python3
"""
Quick prototype for translating PDF and ODT documents using an Ollama chat API.
"""
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Callable, Iterator
import argparse
import html
import json
import logging
import re
import sys

import colorlog
import odf
import odf.element
import odf.opendocument
import pymupdf
import requests
class Transformer(ABC):
@abstractmethod
def transform(self, callback: Callable[[str], str | None]) -> None:
raise NotImplementedError
@abstractmethod
def save(self, outfile: Path) -> None:
raise NotImplementedError
class PdfTransformer(Transformer):
    """
    PDF backend: paint over each text block and place the translation on an
    optional-content layer, following
    https://github.com/pymupdf/PyMuPDF-Utilities/blob/tutorials/tutorials/language-translation/translator.py
    """

    def __init__(self, infile: Path) -> None:
        self._doc: pymupdf.Document = pymupdf.Document(infile)
        # named optional content group/layer holding all translated text
        self._ocg = self._doc.add_ocg("Translation", on=True)

    def transform(self, callback: Callable[[str], str | None]) -> None:
        for page in self._doc:  # type: pymupdf.Page
            for block in page.get_text("blocks", flags=pymupdf.TEXT_DEHYPHENATE):
                translation = callback(block[4])
                if translation is None:
                    continue
                rect = block[:4]
                # white-out the original block, then render the translation as HTML
                page.draw_rect(rect, color=None, fill=pymupdf.pdfcolor["white"], oc=self._ocg)
                markup = html.escape(translation).replace("\n", "<br>")
                page.insert_htmlbox(rect, markup, oc=self._ocg)

    def save(self, outfile: Path) -> None:
        # subset embedded fonts to keep the output size down
        self._doc.subset_fonts()
        self._doc.save(outfile)
class OdfTransformer(Transformer):
    """
    ODT backend: walk the document tree and replace text node contents, cf.
    https://github.com/eea/odfpy/wiki/ReplaceOneTextToAnother
    """

    def __init__(self, infile: Path) -> None:
        self._doc: odf.opendocument.OpenDocument = odf.opendocument.load(infile)

    @classmethod
    def _saxiter(cls, node: odf.element.Element) -> Iterator[odf.element.Element]:
        # iterative pre-order walk: node first, then its subtree, then its siblings
        pending: list = [node]
        while pending:
            current = pending.pop()
            if not current:
                continue
            yield current
            # LIFO: push the sibling first so the child subtree is emitted before it
            pending.append(current.nextSibling)
            if current.hasChildNodes():
                pending.append(current.firstChild)

    def transform(self, callback: Callable[[str], str | None]) -> None:
        for elem in self._saxiter(self._doc.topnode):
            # exact-class check deliberately excludes Text subclasses
            if elem.__class__ is odf.element.Text and isinstance(elem.data, str):
                translation = callback(elem.data)
                if translation is not None:
                    elem.data = translation

    def save(self, outfile: Path) -> None:
        self._doc.save(outfile)
class TranslationHistory:
    """
    For chat-context, preserve a certain amount of past original and translation i/o as FIFO.
    """

    def __init__(self, chunk_size: int) -> None:
        # budget on the combined character count of all retained strings
        self._chunk_size: int = chunk_size
        self._pairs: list[tuple[str, str]] = []  # (original, translation), oldest first

    def _len(self) -> int:
        # combined character count currently held
        return sum(len(original) + len(translation) for original, translation in self._pairs)

    def _maintain(self) -> None:
        # evict oldest pairs until within budget, but always keep the most recent one
        while self._len() > self._chunk_size and len(self._pairs) > 1:
            self._pairs.pop(0)

    def push(self, text: str, translation: str) -> None:
        self._pairs.append((text, translation))

    def get(self) -> tuple[str, str]:
        self._maintain()
        originals = "\n".join(original for original, _ in self._pairs)
        translations = "\n".join(translation for _, translation in self._pairs)
        return originals, translations
class OllamaClient:
"""
Simple requests-based Ollama client for chat with 'translator' prompt and past history.
"""
def __init__(self,
api_url: str, model_name: str,
source_lang: str, target_lang: str, context_title: str,
history_len: int):
self._logger: logging.Logger = logging.getLogger(self.__class__.__name__)
self._model_name: str = model_name
self._api_url: str = api_url
self._session: requests.Session = requests.Session()
self._history: TranslationHistory = TranslationHistory(history_len)
self._context_title: str = context_title
self._source_lang: str = source_lang
self._target_lang: str = target_lang
def _prompt(self) -> str:
prompt: str = f"You are a professional translator. Translate from {self._source_lang} to {self._target_lang}.\n"
prompt += "Preserve formatting and line breaks.\n"
prompt += "Return only the direct translation without comments.\n"
if self._context_title:
prompt += f"Document type: {self._context_title}\n"
return prompt
def _is_hallucination(self, text: str, translation: str) -> bool:
return len(translation) > len(text) * 4
def translate(self, text: str) -> str | None:
messages: list[dict] = [{"role": "system", "content": self._prompt()}]
context_text, context_trans = self._history.get()
if context_text and context_trans:
messages.append({"role": "user", "content": context_text})
messages.append({"role": "assistant", "content": context_trans})
messages.append({"role": "user", "content": text})
self._logger.debug(messages)
try:
response: requests.Response = self._session.post(self._api_url, json={
"model": self._model_name,
"messages": messages,
"stream": False,
}, timeout=(30, 120))
response.raise_for_status()
result: dict = json.loads(response.text)
if "message" not in result or result["message"]["role"] != "assistant":
self._logger.warning("No Ollama assistant response")
return None
translation: str = result["message"]["content"].strip()
if self._is_hallucination(text, translation):
self._logger.warning("Hallucination")
return None
self._history.push(text, translation)
return translation
except requests.exceptions.Timeout:
self._logger.error("Ollama timeout")
return None
except Exception as e:
self._logger.error(f"Unexpected Ollama error: {e}", exc_info=e)
return None
class Chunker:
"""
Split/join sentences to be translated wrt given chunk size.
"""
def __init__(self, cb: Callable[[str], str | None], chunk_len: int) -> None:
self._cb: Callable[[str], str | None] = cb
self._chunk_len: int = chunk_len
def _chunk_sentence(self, text: str) -> Iterator[str]:
"""If too long, try to recursively split at sentence boundaries."""
if len(text) > self._chunk_len:
delim: list[int] = [_.start() for _ in re.finditer("[\n.¡!!¿??::]", text)]
if len(delim) > 1:
pos: int = delim[len(delim) // 2]
yield from self._chunk_sentence(text[:pos])
yield from self._chunk_sentence(text[pos:])
return
yield text
def _pad_whitespace_strip(self, text: str) -> str:
"""Strip possibly splitted input, but re-add whitespace again."""
prefix: str = re.search("^\\s*", text)[0] # type: ignore
suffix: str = re.search("\\s*$", text[len(prefix):])[0] # type: ignore
return "".join((prefix, self._translate(text[len(prefix):-len(suffix)]).strip(), suffix))
def _translate(self, text: str) -> str:
translated: str | None = self._cb(text)
return translated if translated is not None else text # XXX:
def run_iter(self, text: str) -> str | None:
return "".join(self._pad_whitespace_strip(_) for _ in self._chunk_sentence(text))
class Translator:
    """
    Main interface providing the string translation callable.
    """

    def __init__(self, client: OllamaClient, chunk_len: int) -> None:
        self._logger: logging.Logger = logging.getLogger(self.__class__.__name__)
        self._chunker: Chunker = Chunker(client.translate, chunk_len)

    def translate(self, orig: str) -> str | None:
        # skip whitespace-only fragments and bare numeric strings
        stripped = orig.strip()
        if not stripped or orig.isnumeric():
            return None
        self._logger.info(f"> {json.dumps(stripped, ensure_ascii=False)}")
        translated: str | None = self._chunker.run_iter(orig)
        # a failed, empty or unchanged result means "leave the document as-is"
        if translated is None or not translated.strip() or translated == orig:
            return None
        self._logger.info(f"< {json.dumps(translated.strip(), ensure_ascii=False)}")
        return translated
def main(infile: Path, outfile: Path,
         api_url: str, model: str,
         source_lang: str, target_lang: str, title: str,
         history_len: int, chunk_len: int) -> int:
    """Translate infile into outfile; return a process exit status (0 ok, 1 error/interrupt)."""
    logger: logging.Logger = logging.getLogger(main.__name__)
    client = OllamaClient(
        api_url=api_url, model_name=model,
        source_lang=source_lang, target_lang=target_lang, context_title=title,
        history_len=history_len
    )
    translator = Translator(client, chunk_len=chunk_len)
    if infile.suffix == ".pdf":
        doc: Transformer = PdfTransformer(infile)
    elif infile.suffix == ".odt":
        doc = OdfTransformer(infile)
    else:
        logger.error(f"Input file extension not supported: {infile.suffix}")
        return 1
    status = 0
    try:
        doc.transform(translator.translate)
    except KeyboardInterrupt:
        # keep partial progress on Ctrl-C, but signal failure via the exit status
        logger.warning("Interrupt")
        status = 1
    doc.save(outfile)
    return status
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__.strip(),
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--api", metavar="URL", type=str, default="http://localhost:11434/api/chat",
                        help="ollama endpoint to use")
    # NOTE: these options previously combined required=True with a default; argparse
    # ignores the default for required options while ArgumentDefaultsHelpFormatter
    # still advertises it in --help. Dropping required=True makes the defaults real.
    parser.add_argument("--model", type=str, default="llama3.1:latest",
                        help="ollama model to use")
    parser.add_argument("--source-lang", metavar="LANG", type=str, default="English",
                        help="original input document language")
    parser.add_argument("--target-lang", metavar="LANG", type=str, default="German",
                        help="desired translated output language")
    # default "" keeps OllamaClient.context_title a str; empty disables the prompt hint
    parser.add_argument("--title-context", metavar="TITLE", type=str, default="",
                        help="document title/context as hinted by prompt")
    parser.add_argument("--history-len", metavar="LEN", type=int, default=5000,
                        help="length of past original/translation to replay as context")
    parser.add_argument("--chunk-len", metavar="LEN", type=int, default=5000,
                        help="try to split by sentence boundaries when input length exceeded")
    parser.add_argument("--debug", action="store_true",
                        help="enable debug logging")
    parser.add_argument("infile", type=Path)
    parser.add_argument("outfile", type=Path)
    args = parser.parse_args()
    colorlog.basicConfig(level=logging.DEBUG if args.debug else logging.INFO, stream=sys.stderr,
                         format="%(log_color)s%(levelname)-8s%(reset)s %(name)s: %(message)s")
    sys.exit(main(infile=args.infile, outfile=args.outfile,
                  api_url=args.api, model=args.model,
                  source_lang=args.source_lang, target_lang=args.target_lang, title=args.title_context,
                  history_len=args.history_len, chunk_len=args.chunk_len))