# l10n_slovnik

#!/usr/bin/python
"""
Script to generate TBX / TMX dictionaries from wiki page.

Specially crafted for Czech l10n team usage.
"""

import re
import sys
from urllib.request import urlopen

from translate.storage.tbx import tbxfile
from translate.storage.tmx import tmxfile

# Matches a parenthesised, bracketed link of the form
# "([http://<list host>/pipermail/diskuze/<path> <label ending in diskuze>])".
# The dots of the host names are escaped so that they only match a literal
# "." and both the http and https schemes are accepted.
DISKUZE = re.compile(
    r"\(\[https?://(lists\.ubuntu\.cz|lists\.l10n\.cz)/"
    r"pipermail/diskuze/[^ ]* [^\]]*diskuze\]\)",
)
# Same link without the enclosing parentheses; any run of ";" and spaces
# directly around the link is consumed together with it.
DISKUZE2 = re.compile(
    r"[; ]*\[https?://(lists\.ubuntu\.cz|lists\.l10n\.cz)/pipermail/diskuze/"
    r"[^ ]* [^\]]*diskuze\][; ]*",
)

# Raw Markdown source of the glossary page that gets downloaded.
URL = (
    "https://gitlab.com/OpenAlt/L10Ncz/l10n-cz/-/raw/master/_pages/wiki/"
    "Slovn%C3%ADky/P%C5%99ekladatelsk%C3%BD_slovn%C3%ADk.md"
)

# Collected terms as (source, clarification, [translations]) tuples.
ITEMS = []


def clean_target(target):
    """
    Strip noise from a single translation.

    Every match of the two discussion-link patterns is deleted first,
    then leading and trailing whitespace is trimmed from the result.
    """
    for pattern in (DISKUZE, DISKUZE2):
        target = pattern.sub("", target)
    return target.strip()


def new_item(source, target):
    """
    Record one glossary term in ITEMS.

    ``source`` is the source word, optionally followed by a clarification
    in parentheses; ``target`` is the list of its translations.  Nothing
    is stored when any translation is marked as still undecided.
    """
    # Terms that are still being discussed or not yet unified are left out.
    undecided = ("v diskuzi", "zatím nesjednoceno")
    if any(marker in tgt for tgt in target for marker in undecided):
        return

    # Drop discussion links and stray whitespace from every translation.
    cleaned = [clean_target(tgt) for tgt in target]

    # A source ending in ")" carries a clarification: everything after the
    # first "(" is split off and kept separately from the word itself.
    extra = ""
    if "(" in source and source.endswith(")"):
        source, _, extra = source.partition("(")
        extra = extra.strip().rstrip(")").strip()

    ITEMS.append((source.strip(), extra, cleaned))


def process_dict():
    """
    Download the dictionary from the wiki and process its words.

    The page is read line by line.  Everything before the "# A" heading
    is skipped.  A line starting with "-" opens a new source term, the
    following non-empty lines are its translations, and an empty line
    closes the term.  Each finished term is handed to new_item().
    """
    source = ""
    target = []
    header = True

    with urlopen(URL) as handle:
        for source_line in handle:
            line = source_line.decode("utf-8").strip()
            if header:
                # Ignore the preamble up to the first letter heading.
                if line != "# A":
                    continue
                header = False

            if line.startswith("-"):
                # Source string; store the previous term first if the
                # page did not separate the two with an empty line.
                if source:
                    new_item(source, target)
                    target = []
                source = line[1:].strip()
            elif line:
                # Translation; ignored while no source term is open.
                if source == "":
                    continue
                # Skip notes
                if line.startswith(("*Poznámka:*", "Poznámka:", "[")):
                    continue
                # Lines opening with "(" are not taken as translations.
                if line[0] != "(":
                    target.append(line)
            elif source:
                # Empty line ends the current term.
                new_item(source, target)
                source = ""
                target = []

    # The page may end without a trailing empty line; store the term that
    # is still pending so the last entry is not lost.
    if source:
        new_item(source, target)


def write_tbx(name):
    """
    Save the collected terms as a TBX file called ``name``.

    One unit is added per non-empty translation; a clarification, when
    present, is appended to the source term in parentheses.  The target
    language passed to the unit is "cs".
    """
    store = tbxfile()

    for source, extra, targets in ITEMS:
        term = source if extra == "" else f"{source} ({extra})"
        for translation in targets:
            if translation != "":
                unit = store.UnitClass(term)
                unit.settarget(translation, "cs")
                store.addunit(unit)

    store.savefile(name)


def write_tmx(name):
    """
    Save the collected terms as a TMX file called ``name``.

    One unit is added per non-empty translation; a clarification, when
    present, is appended to the source term in parentheses.  The target
    language passed to the unit is "cs".
    """
    store = tmxfile()

    for source, extra, targets in ITEMS:
        term = source if extra == "" else f"{source} ({extra})"
        for translation in targets:
            if translation != "":
                unit = store.UnitClass(term)
                unit.settarget(translation, "cs")
                store.addunit(unit)

    store.savefile(name)


if __name__ == "__main__":
    # Exactly one argument is expected: the name of the output file.
    if len(sys.argv) != 2:
        print("Usage: l10n-slovnik [file.tbx/tmx]", file=sys.stderr)
        sys.exit(1)

    output = sys.argv[1]
    lowered = output.lower()

    # Choose the writer before downloading anything, so that a bad file
    # name fails immediately.  The file extension decides first; the
    # original substring test is kept as a fallback for names that carry
    # no recognised extension.
    if lowered.endswith(".tbx"):
        writer = write_tbx
    elif lowered.endswith(".tmx"):
        writer = write_tmx
    elif "tbx" in output:
        writer = write_tbx
    elif "tmx" in output:
        writer = write_tmx
    else:
        print(f"Unknown format: {output}", file=sys.stderr)
        sys.exit(1)

    # Download/process dict
    process_dict()

    # Save the dictionary in the selected format
    writer(output)