{updated version below}
Hi all translators, I’ve whipped up a little script in my spare time, with the help of ChatGPT, that takes untranslated strings, pipes them through Google Translate, and writes the target translations.
From the Transifex Dashboard, choose the language package you want to translate, choose “Download untranslated strings as XLIFF”, and save the file in the same directory as the script. Translated files are written to ./output/lang_code/file
Note that it’s not perfect: I probably haven’t accounted for all variable types yet, and it sometimes throws errors that I suppose could be silenced or otherwise handled better, but on the samples I’ve tried so far it worked (mostly) great. Occasionally extra spaces are added or new-lines are not preserved, so you absolutely still have to manually verify each line in the file. At least the bulk of the text should be translated, making it easy to just touch up the files instead of typing out each translation in its entirety.
I hereby release my efforts to the community to use it or lose it; feel free to do with it as you please.
xlf-translator.py (6.3 KB)
Requires: googletrans==4.0.0rc1
import os
import re
from googletrans import Translator
from xml.etree import ElementTree as ET
# Regex patterns for tokens that must be carried through translation unchanged.
# Each pattern is passed to ``re.finditer`` and the whole match (group 0) is
# the text that gets protected, so the capture groups below are incidental.
regex_patterns = [
    # The conversion character is optional so that a bare %(variable) is
    # protected as well, as the description of this pattern promises.
    r'%\([^)]+\)[ds]?',  # Match all variables like %(variable), %(variable)d or %(variable)s
    r'AM|PM',  # Match AM or PM for time formats
    r'Open edX|edX',  # Match the literal string "Open edX" or "edX"
    r'\b[A-Z_]{2,}\b',  # Match all-uppercase words with underscores (e.g., DATE_TIME, DATE_TIME_FORMAT)
    r'\{([^{}]+)\}',  # Match variable enclosed in {curly-braces}
    r'%=\s*(.*?)\s*%',  # Match variables enclosed in "%=var%" eg %=percent%
]
# Shared googletrans client, created once at import time and used by translate_sentence
translator = Translator()
# Function to collect the literal text of every protected-token match in the source text
def find_exact_matches(source_text):
    """Return the matched text for every ``regex_patterns`` hit in *source_text*.

    Results are grouped by pattern, in the order the patterns are listed, and
    within one pattern in order of appearance. Duplicates and matches that
    overlap across patterns are all kept.
    """
    return [
        found.group(0)  # the entire matched text, not just a capture group
        for pattern in regex_patterns
        for found in re.finditer(pattern, source_text)
    ]
# Function to translate a sentence while keeping protected tokens intact
def translate_sentence(sentence, target_language):
    """Translate *sentence* from English into *target_language*.

    Every token reported by ``find_exact_matches`` is swapped for a numbered
    placeholder (``{1}``, ``{2}``, ...) before the text is sent to the
    translator, and swapped back afterwards, so variables and brand names are
    not translated.

    Args:
        sentence: The English text to translate.
        target_language: Destination language code handed to the translator.

    Returns:
        The translated text with the protected tokens restored. If the
        translation call fails, the original sentence is returned instead.
        Blank input is returned unchanged without calling the translator.
    """
    # Nothing to translate; also avoids a pointless request to the service.
    if not sentence.strip():
        return sentence

    # Unique tokens, longest first, so a token that contains a shorter one
    # (e.g. "{USER_NAME}" vs "USER_NAME") is masked as a whole. The secondary
    # sort key keeps the numbering deterministic.
    tokens = sorted(set(find_exact_matches(sentence)), key=lambda t: (-len(t), t))
    placeholders = {token: f'{{{i}}}' for i, token in enumerate(tokens, start=1)}

    if tokens:
        # Mask in ONE pass. Chained str.replace calls would re-scan text that
        # was already substituted, so a source token such as "{1}" would be
        # confused with the placeholder "{1}" and restored incorrectly.
        token_re = re.compile('|'.join(re.escape(token) for token in tokens))
        sentence = token_re.sub(lambda m: placeholders[m.group(0)], sentence)

    try:
        # Attempt to translate the masked sentence
        translation = translator.translate(sentence, src="en", dest=target_language)
        # Check if the translation result is not None
        if translation is not None:
            target_text = translation.text
        else:
            print("Translation failed. Using the original sentence.")
            target_text = sentence  # Use the original sentence as the translation
    except Exception as e:
        # Deliberately broad: this is best-effort, one failed string must not
        # abort the whole file.
        print(f"Translation failed. Error: {e}")
        print("Using the original sentence.")
        target_text = sentence  # Use the original sentence in case of an error

    def restore(match):
        # Map a placeholder number back to its token; leave unknown numbers alone.
        index = int(match.group(1)) - 1
        return tokens[index] if 0 <= index < len(tokens) else match.group(0)

    # Restore in one pass as well, tolerating whitespace that may have been
    # inserted inside the braces (e.g. "{ 1 }") on the way back.
    return re.sub(r'\{\s*(\d+)\s*\}', restore, target_text)
# Determine the target language declared in the XLIFF, trimmed to its primary language code
def determine_target_language(root):
    """Return the primary language code from the first ``<file>`` element.

    Args:
        root: Root element of the parsed XLIFF document.

    Returns:
        The ``target-language`` attribute of the first ``<file>`` element with
        any region/script/variant suffix removed (``pt-BR``, ``pt_BR`` and
        ``sr@latin`` become ``pt``, ``pt`` and ``sr``). Returns ``None`` when
        there is no ``<file>`` element or no ``target-language`` attribute,
        and the empty string when the attribute is empty.
    """
    file_element = root.find('.//file')
    if file_element is None:
        return None

    target_language = file_element.attrib.get('target-language')
    if not target_language:
        return target_language

    # Locale codes are written with '-' or '_' before the region and '@'
    # before a variant; keep only the part in front of the first separator.
    return re.split(r'[-_@]', target_language.strip(), maxsplit=1)[0]
# Function to prompt the user for a language code when none could be determined
def ask_user_for_target_language():
    """Print a short explanation with sample codes and return what the user types."""
    guidance = (
        "Unable to validate the target language.",
        "Please enter a supported ISO 639-1 language code, e.g.:",
        " 'hr': 'croatian'",
        " 'cs': 'czech'",
        " 'da': 'danish'",
        # ... (other supported languages)
    )
    for line in guidance:
        print(line)
    return input("Enter the ISO 639-1 language code: ")
# Helper: translate one source string sentence by sentence, keeping its whitespace layout
def _translate_preserving_whitespace(source_text, target_language):
    """Translate *source_text* piecewise and re-assemble it with its original spacing."""
    translated = []
    # Split only where a full stop is followed by whitespace, so "3.5",
    # "example.com" and "..." stay in one piece. The capture group makes
    # re.split return the whitespace separators too, so they can be re-emitted
    # exactly as they were (including new-lines).
    for piece in re.split(r'(?<=\.)(\s+)', source_text):
        core = piece.strip()
        if not core:
            translated.append(piece)  # separator or empty piece: keep verbatim
            continue
        # Keep leading/trailing whitespace out of the translation request and
        # put it back afterwards.
        lead = piece[:len(piece) - len(piece.lstrip())]
        trail = piece[len(piece.rstrip()):]
        translated.append(lead + translate_sentence(core, target_language) + trail)
    return ''.join(translated)


# Function to process and translate a single .xlf file
def process_xlf_file(file_path):
    """Translate every untranslated ``<trans-unit>`` of one .xlf file.

    The target language is taken from the file via
    ``determine_target_language``; if that yields nothing the user is asked
    for it. Units whose ``<target>`` already has text are left alone, as are
    units without any source text. The result is written to
    ``./output/<target_language>/`` (plus the root element's ``original``
    attribute, when present) under the same file name as the input.

    Args:
        file_path: Path of the .xlf file to translate.
    """
    print(f"Processing file: {file_path}")
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Determine the target language from the XML file
    target_language = determine_target_language(root)
    print(f"Detected target language: {target_language}")
    if not target_language:
        target_language = ask_user_for_target_language()
        print(f"Manually entered target language: {target_language}")

    # Create the output directory
    original = root.attrib.get('original')
    if original:
        output_dir = os.path.join(os.getcwd(), 'output', target_language, original)
    else:
        output_dir = os.path.join(os.getcwd(), 'output', target_language)
    os.makedirs(output_dir, exist_ok=True)
    print(f"Output directory: {output_dir}")

    trans_units = root.findall('.//trans-unit')
    total_strings = len(trans_units)
    for current_string, trans_unit in enumerate(trans_units, start=1):
        source_element = trans_unit.find('.//source')
        source_text = source_element.text if source_element is not None else None
        target_text_element = trans_unit.find('.//target')
        target_text = target_text_element.text if target_text_element is not None else ''

        # Skip units that are already translated, and units with a missing or
        # empty <source> (there is nothing to send to the translator).
        if target_text or not source_text:
            continue

        print(f"\nTranslating string {current_string} of {total_strings}\nSource text: {source_text}")
        target_text = _translate_preserving_whitespace(source_text, target_language)

        # Update the <target> block, creating it when the unit has none
        if target_text_element is None:
            target_text_element = ET.SubElement(trans_unit, 'target')
        target_text_element.text = target_text
        print(f"Translated text: {target_text}")

    # Save the modified XML for this file to the output file. ElementTree
    # omits the XML declaration for UTF-8 unless asked for it explicitly.
    output_file = os.path.join(output_dir, os.path.basename(file_path))
    tree.write(output_file, encoding='utf-8', xml_declaration=True)
    print("======================================")
    print(f"Saved modified XML to: {output_file}")
# Translate each .xlf file found directly in the current directory (sub-directories are not searched)
working_dir = os.getcwd()
for entry in os.listdir(working_dir):
    if not entry.endswith('.xlf'):
        continue
    process_xlf_file(os.path.join(working_dir, entry))
    print("======================================")