Tool for translators

{updated version below}
Hi all translators, I’ve whipped up a little script in my spare time, with the help of ChatGPT, that takes untranslated strings, pipes them through Google Translate, and writes the target translations.
From the Transifex Dashboard, choose the language package you want to translate, choose “Download untranslated strings as XLIFF”, and save the file in the same directory as the script. Translated files are written to ./output/lang_code/file

Note that it’s not perfect: I probably haven’t accounted for all variable types yet, it sometimes throws errors that I suppose could be silenced or otherwise handled better, etc., but on the samples I’ve tried so far it worked (mostly) great. Occasionally extra spaces are added or new-lines are not preserved, so you absolutely still have to manually verify each line in the file, but at least the bulk of the text should be translated, making it easy to just touch up the files instead of typing out each translation in its entirety.

I hereby release my efforts to the community to choose to use it or lose it :slight_smile: Feel free to do with it as you please.

xlf-translator.py (6.3 KB)

Requires: googletrans==4.0.0rc1

import os
import re
from googletrans import Translator
from xml.etree import ElementTree as ET

# Regex patterns for text that must come through translation unchanged.
# translate_sentence swaps every match for a numbered placeholder before
# translating and swaps the original text back in afterwards.
regex_patterns = [
    r'%\([^)]+\)[ds]',  # Match named %-format variables: %(variable)d or %(variable)s (the trailing d/s is required)
    r'AM|PM',           # Match AM or PM for time formats (no word boundaries, so also matches inside longer text)
    r'Open edX|edX',    # Match the literal string "Open edX" or "edX"
    r'\b[A-Z_]{2,}\b',  # Match words of 2+ uppercase letters/underscores (e.g., DATE_TIME, DATE_TIME_FORMAT)
    r'\{([^{}]+)\}',    # Match variable enclosed in {curly-braces}
    r'%=\s*(.*?)\s*%',  # Match variables enclosed in "%=var%" eg %=percent%
]

# Shared googletrans client; translate_sentence sends every request through it
translator = Translator()

# Function to collect the substrings of the source text that match the regex patterns
def find_exact_matches(source_text):
    """Return every substring of ``source_text`` matched by ``regex_patterns``.

    Results are grouped by pattern, in the order the patterns are listed, and
    are left-to-right within one pattern. Nothing is de-duplicated: text that
    is matched more than once appears once per match.
    """
    return [
        hit.group(0)  # the whole matched text, not just a capture group
        for pattern in regex_patterns
        for hit in re.finditer(pattern, source_text)
    ]

# Function to translate a sentence while protecting variables and fixed terms
def translate_sentence(sentence, target_language):
    """Translate ``sentence`` from English into ``target_language``.

    Substrings reported by ``find_exact_matches`` (format variables, product
    names, ...) are replaced by numbered ``{n}`` placeholders before the text
    is sent to the translator, and are put back afterwards.

    Args:
        sentence: English text to translate.
        target_language: Destination language code passed to the translator.

    Returns:
        The translated text with the protected substrings restored. Empty or
        whitespace-only input is returned unchanged without calling the
        translator. If the translator raises or returns no text, the
        original, untranslated ``sentence`` is returned.
    """
    # Nothing to translate: skip the round trip to the translation service.
    if not sentence or not sentence.strip():
        return sentence

    # Unique protected fragments in first-seen order; a fragment's position
    # in this list (1-based) is its placeholder number.
    protected = [
        fragment
        for fragment in dict.fromkeys(find_exact_matches(sentence))
        if fragment
    ]

    masked = sentence
    if protected:
        number_of = {fragment: n for n, fragment in enumerate(protected, start=1)}
        # Longest alternatives first, so that e.g. "Open edX" is masked as a
        # whole instead of only its "edX" part.
        alternation = '|'.join(
            re.escape(fragment)
            for fragment in sorted(protected, key=len, reverse=True)
        )
        # One single pass: placeholders that were already inserted are never
        # scanned again, so a source that itself contains "{1}" can no longer
        # overwrite placeholder number 1.
        masked = re.sub(
            alternation,
            lambda found: f'{{{number_of[found.group(0)]}}}',
            masked,
        )

    try:
        # Attempt to translate the masked sentence
        translation = translator.translate(masked, src="en", dest=target_language)
    except Exception as e:
        # Deliberately broad: any translator failure falls back to the
        # untranslated text instead of aborting the whole file.
        print(f"Translation failed. Error: {e}")
        print("Using the original sentence.")
        return sentence

    target_text = getattr(translation, 'text', None)
    if target_text is None:
        print("Translation failed. Using the original sentence.")
        return sentence

    def restore(found):
        number = int(found.group(1))
        if 1 <= number <= len(protected):
            return protected[number - 1]
        return found.group(0)  # not one of our placeholders: leave it alone

    # Also a single pass, so restored text is never re-interpreted as a
    # placeholder. Whitespace inside the braces is tolerated in case the
    # translator returned "{ 1 }" instead of "{1}".
    return re.sub(r'\{\s*(\d+)\s*\}', restore, target_text)

# Function to read the target language from the XLIFF <file> element,
# trimmed to its primary language code
def determine_target_language(root):
    """Return the primary target-language code declared in the XLIFF tree.

    Args:
        root: Root element of the parsed XLIFF document.

    Returns:
        The ``target-language`` attribute of the first ``<file>`` descendant,
        reduced to the part before the first ``-`` or ``_`` (``pt-BR`` and
        ``pt_BR`` both give ``pt``). ``None`` when there is no ``<file>``
        element or it has no ``target-language`` attribute; an empty
        attribute gives an empty string.
    """
    file_element = root.find('.//file')
    if file_element is None:
        return None

    target_language = file_element.attrib.get('target-language')
    if not target_language:
        return target_language

    # Region subtags are written with either separator ("pt-BR", "pt_BR");
    # normalise to one of them and keep only the leading language part.
    return target_language.strip().replace('_', '-').split('-')[0]

# Function to prompt for the target language when the file does not provide one
def ask_user_for_target_language():
    """Print a short hint with example codes and return the code the user types.

    The answer is returned exactly as entered; it is not validated here.
    """
    hint_lines = (
        "Unable to validate the target language.",
        "Please enter a supported ISO 639-1 language code, e.g.:",
        "    'hr': 'croatian'",
        "    'cs': 'czech'",
        "    'da': 'danish'",
        # ... (other supported languages)
    )
    for hint in hint_lines:
        print(hint)
    return input("Enter the ISO 639-1 language code: ")

# Translate one segment while keeping its own leading/trailing whitespace
def _translate_segment(segment, target_language):
    core = segment.strip()
    if not core:
        return segment  # nothing but whitespace: leave it untouched
    leading = segment[:len(segment) - len(segment.lstrip())]
    trailing = segment[len(segment.rstrip()):]
    return leading + translate_sentence(core, target_language).strip() + trailing


# Translate a string sentence by sentence, keeping the original separators
def _translate_text(source_text, target_language):
    # Split on the whitespace that follows a full stop. The capturing group
    # makes re.split return that whitespace as well (at the odd indexes), so
    # it is put back exactly as it was, new-lines included. A full stop that
    # is not followed by whitespace ("3.5", "file.txt") does not split.
    pieces = re.split(r'(?<=\.)(\s+)', source_text)
    return ''.join(
        piece if index % 2 else _translate_segment(piece, target_language)
        for index, piece in enumerate(pieces)
    )


# Function to process and translate a single .xlf file
def process_xlf_file(file_path):
    """Fill in the missing translations of one XLIFF file and save a copy.

    Every ``<trans-unit>`` whose ``<target>`` is missing or empty gets a
    machine translation of its ``<source>`` text. Units that already have a
    target, and units without any source text, are left as they are. The
    result is written to ``./output/<language>/`` under the same file name;
    the input file itself is not modified.

    Args:
        file_path: Path of the ``.xlf`` file to translate.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    print(f"Processing file: {file_path}")

    tree = ET.parse(file_path)
    root = tree.getroot()

    # Determine the target language from the XML file, else ask for it
    target_language = determine_target_language(root)
    print(f"Detected target language: {target_language}")

    if not target_language:
        target_language = ask_user_for_target_language()
        print(f"Manually entered target language: {target_language}")

    # Create the output directory
    original = root.attrib.get('original')
    if original:
        output_dir = os.path.join(os.getcwd(), 'output', target_language, original)
    else:
        output_dir = os.path.join(os.getcwd(), 'output', target_language)
    os.makedirs(output_dir, exist_ok=True)
    print(f"Output directory: {output_dir}")

    trans_units = root.findall('.//trans-unit')
    total_strings = len(trans_units)

    for current_string, trans_unit in enumerate(trans_units, start=1):
        source_element = trans_unit.find('.//source')
        target_text_element = trans_unit.find('.//target')

        if target_text_element is not None and target_text_element.text:
            continue  # already translated

        # NOTE: Element.text is only the text before the first child element,
        # so inline markup inside <source> is not carried over.
        source_text = source_element.text if source_element is not None else None
        if not source_text or not source_text.strip():
            print(f"\nSkipping string {current_string} of {total_strings}: no source text")
            continue

        print(f"\nTranslating string {current_string} of {total_strings}\nSource text: {source_text}")
        target_text = _translate_text(source_text, target_language)

        # Update the <target> block
        if target_text_element is not None:
            target_text_element.text = target_text
        else:
            new_target = ET.Element('target')
            new_target.text = target_text
            # XLIFF expects <target> right after <source>, ahead of any
            # <note>/<context-group>; fall back to appending when <source>
            # is not a direct child of this unit.
            siblings = list(trans_unit)
            if source_element in siblings:
                new_target.tail = source_element.tail  # keep the file's indentation
                trans_unit.insert(siblings.index(source_element) + 1, new_target)
            else:
                trans_unit.append(new_target)
        print(f"Translated text: {target_text}")

    # Save the modified XML for this file to the output file
    output_file = os.path.join(output_dir, os.path.basename(file_path))
    tree.write(output_file, encoding='utf-8')
    print("======================================")
    print(f"Saved modified XML to: {output_file}")

# Loop through .xlf files in the current directory (non-recursive).
# Guarded so that importing this file does not start translating, and
# sorted so the files are always processed in the same order.
if __name__ == '__main__':
    for file_name in sorted(os.listdir(os.getcwd())):
        if file_name.endswith('.xlf'):
            process_xlf_file(os.path.join(os.getcwd(), file_name))
            print("======================================")

3 Likes

For anyone following this, I’ve made a new version that relies on a different translator instead of googletrans, which seems to be abandonware and was giving me lots of issues.

This one’s still not perfect: sometimes it adds extra spaces or punctuation, but I’m too much of a dummy to figure it out. Anyone is welcome to use or modify it as they see fit :slight_smile:
xlf-translator-v2.py (7.1 KB)

requires: pip install translate==3.6.1