Tutorial · Python · Node.js · 2026

How to Translate PDF Documents with an API — Complete Developer Guide 2026

Extract text from PDFs and translate it programmatically using Python (PyMuPDF) and Node.js (pdf-parse). Full code examples, chunking strategies, and cost estimates.

Updated March 2026 · 15 min read

The PDF Translation Challenge

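PDFs are everywhere in business: contracts, technical manuals, research papers, financial reports. When you need to make these documents available in multiple languages, you face a unique set of challenges that don't exist with plain text translation: the text must first be extracted from a binary, layout-oriented format; dense pages can exceed the translation API's per-request character limit and have to be split into chunks; page boundaries should survive the round trip so the output can be matched to the original; and costs add up quickly at high volume.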

This guide shows you exactly how to solve all of these challenges using SocketsIO Translation API with Python and Node.js. SocketsIO supports 195 languages, offers 500,000 characters/month free (no credit card), and costs just $3.50 per million characters — making it the most cost-effective choice for high-volume PDF translation.

💡 What this guide covers: We'll extract text from PDFs, chunk it intelligently, translate it via API, and output a translated text file. Full PDF reconstruction (preserving layout) additionally requires a PDF generation library such as ReportLab (Python) or pdf-lib (Node.js). That step is beyond this guide's scope, but the translation logic is identical.

Python Approach: PyMuPDF + SocketsIO API

PyMuPDF (imported as fitz) is the best Python library for PDF text extraction. It is fast, accurate, and handles complex layouts including multi-column text and tables.

Installation

pip install PyMuPDF requests

Basic PDF Text Extraction

import fitz  # PyMuPDF

def extract_pdf_text(pdf_path):
    """Extract the plain text of every non-empty page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list of dicts, one per page that contains text, with the keys
        ``page`` (1-based page number), ``text`` (the page text with
        surrounding whitespace stripped) and ``char_count`` (the length of
        that stripped text, i.e. the number of characters that would be
        sent for translation).
    """
    pages = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text("text").strip()  # Plain text extraction

            if not text:  # Skip empty pages
                continue

            pages.append({
                'page': page_num,
                'text': text,
                # Measured on the stripped text so it matches 'text'.
                'char_count': len(text),
            })

    return pages

# Example usage: extract the document, then preview the first three entries.
# Note that len(pages) counts only pages that contain text, because
# extract_pdf_text skips empty pages.
pages = extract_pdf_text('document.pdf')
print(f"Extracted {len(pages)} pages")
for p in pages[:3]:  # each entry is a dict with 'page', 'text', 'char_count'
    print(f"Page {p['page']}: {p['char_count']} chars")

Translating Each Page

import fitz
import requests
from pathlib import Path

# Placeholder credential: replace with your own key. It is sent as a Bearer
# token on every translation request.
API_KEY = 'YOUR_SOCKETSIO_API_KEY'
# Endpoint that translate_text() POSTs to.
API_URL = 'https://api.socketsio.com/v1/translate'

def translate_text(text, target, source='auto'):
    """Send one translation request and return the translated string.

    Args:
        text: The text to translate (sent as the ``q`` field).
        target: Target language code.
        source: Source language code; defaults to ``'auto'``.

    Returns:
        The ``translatedText`` of the first entry in the response's
        ``data.translations`` list.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    request_headers = {
        'Authorization': f'Bearer {API_KEY}',
        'Content-Type': 'application/json',
    }
    payload = {'q': text, 'target': target, 'source': source}

    reply = requests.post(
        API_URL,
        headers=request_headers,
        json=payload,
        timeout=30,  # seconds
    )
    reply.raise_for_status()

    first_translation = reply.json()['data']['translations'][0]
    return first_translation['translatedText']

def translate_pdf(pdf_path, target_lang, output_path=None):
    """Translate a PDF page by page and return the combined text.

    Each page is sent to ``translate_text`` as a single request. Translation
    is best-effort per page: a page whose request fails is kept in its
    original language under a ``(failed)`` marker so that one bad request
    does not discard the pages already translated.

    Args:
        pdf_path: Path of the PDF file to translate.
        target_lang: Target language code passed to the API.
        output_path: Optional path; when given, the result is also written
            there as UTF-8 text.

    Returns:
        All pages joined by blank lines, each introduced by a
        ``--- Page N ---`` header (``(empty)`` / ``(failed)`` variants mark
        pages without text and pages that could not be translated).
    """
    translated_pages = []

    # The context manager closes the document even if something below raises.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)
        print(f"Translating {page_count} pages to '{target_lang}'...")

        for page_num, page in enumerate(doc, start=1):
            text = page.get_text("text").strip()

            if not text:
                translated_pages.append(f"--- Page {page_num} (empty) ---")
                continue

            print(f"  Page {page_num}/{page_count} ({len(text)} chars)...")

            try:
                translated = translate_text(text, target=target_lang)
            except (requests.RequestException, KeyError, IndexError) as e:
                # RequestException also covers timeouts and connection
                # errors (not just HTTP error statuses); KeyError/IndexError
                # cover a response body without the expected structure.
                print(f"  Warning: page {page_num} failed: {e}")
                translated_pages.append(f"--- Page {page_num} (failed) ---\n{text}")
            else:
                translated_pages.append(f"--- Page {page_num} ---\n{translated}")

    full_text = '\n\n'.join(translated_pages)

    if output_path:
        Path(output_path).write_text(full_text, encoding='utf-8')
        print(f"Saved to {output_path}")

    return full_text

# Translate one PDF to Spanish and another to Japanese. Each call writes the
# translated text to the given .txt path and also returns it.
translate_pdf('contract.pdf', 'es', 'contract_es.txt')
translate_pdf('manual.pdf', 'ja', 'manual_ja.txt')

Chunking Strategy for Large PDFs

The SocketsIO API accepts up to 5,000 characters per request. Most PDF pages are well under this limit, but dense technical documents may exceed it. Here is a robust chunking strategy:

def chunk_text(text, max_chars=4500):
    """Split text into chunks of at most ``max_chars`` characters.

    Split points are chosen, in order of preference, at the last paragraph
    break (``'\\n\\n'``), sentence end (``'. '``) or space that lies inside
    the first ``max_chars`` characters. If none exists, the text is cut
    hard at exactly ``max_chars`` characters.

    Args:
        text: The text to split.
        max_chars: Maximum length of any returned chunk; must be >= 1.

    Returns:
        A list of chunks with surrounding whitespace stripped. Text that
        already fits is returned unchanged as a single-element list.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    if len(text) <= max_chars:
        return [text]

    chunks = []
    remaining = text

    while len(remaining) > max_chars:
        # Hard cut by default: exactly max_chars characters, never one more.
        cut = max_chars

        for separator in ('\n\n', '. ', ' '):
            split_pos = remaining.rfind(separator, 0, max_chars)
            if split_pos != -1:
                # Keep the separator's first character (e.g. the period)
                # with the chunk that precedes it.
                cut = split_pos + 1
                break

        piece = remaining[:cut].strip()
        if piece:  # Leading whitespace must not yield an empty chunk
            chunks.append(piece)
        remaining = remaining[cut:].strip()

    if remaining:
        chunks.append(remaining)

    return chunks


def translate_large_page(text, target, source='auto'):
    """Translate a page of any length by sending it in chunks.

    The text is split with ``chunk_text``. A page that fits in one chunk is
    translated with a single request; otherwise each chunk is translated in
    order and the results are joined with single spaces.

    Args:
        text: The page text to translate.
        target: Target language code.
        source: Source language code; defaults to ``'auto'``.

    Returns:
        The translated page text.
    """
    chunks = chunk_text(text)

    if len(chunks) != 1:
        print(f"    Chunked into {len(chunks)} parts")
        return ' '.join([translate_text(piece, target, source) for piece in chunks])

    # The page fits in one request, so send it as is.
    return translate_text(text, target, source)


def translate_pdf_robust(pdf_path, target_lang, output_path=None):
    """Translate a PDF page by page, chunking pages that are too long.

    Pages without text are skipped. After the last page the total number of
    source characters and an estimated cost (at $3.50 per million
    characters) are printed.

    Errors raised while translating are not caught here, so a failed
    request aborts the run before anything is written to ``output_path``.

    Args:
        pdf_path: Path of the PDF file to translate.
        target_lang: Target language code passed to the API.
        output_path: Optional path; when given, the result is also written
            there as UTF-8 text.

    Returns:
        The translated pages joined by blank lines, each introduced by a
        ``=== Page N ===`` header.
    """
    translated_pages = []
    total_chars = 0

    # The context manager closes the document even when a translation
    # request raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text("text").strip()

            if not text:
                continue

            total_chars += len(text)
            translated = translate_large_page(text, target=target_lang)
            translated_pages.append(f"=== Page {page_num} ===\n{translated}")

    print(f"Total characters: {total_chars:,}")
    print(f"Estimated cost: ${total_chars / 1_000_000 * 3.50:.4f}")

    result = '\n\n'.join(translated_pages)
    if output_path:
        Path(output_path).write_text(result, encoding='utf-8')

    return result

Node.js Approach: pdf-parse + axios

For Node.js developers, pdf-parse provides reliable PDF text extraction. Combined with axios for HTTP requests, you can build a complete PDF translation pipeline in under 100 lines.

Installation

npm install pdf-parse axios

Basic PDF Translation in Node.js

const pdfParse = require('pdf-parse');
const axios = require('axios');
const fs = require('fs');
const path = require('path');

// Placeholder credential: replace with your own key. It is sent as a Bearer
// token on every translation request.
const API_KEY = 'YOUR_SOCKETSIO_API_KEY';
// Endpoint that translateText() POSTs to.
const API_URL = 'https://api.socketsio.com/v1/translate';

/**
 * Read a PDF from disk and parse it with pdf-parse.
 *
 * @param {string} pdfPath Path of the PDF file.
 * @returns {Promise<{text: string, pages: number, info: object}>} The
 *   extracted text, the page count reported by the parser, and its `info`
 *   metadata object.
 */
async function extractPdfText(pdfPath) {
  const parsed = await pdfParse(fs.readFileSync(pdfPath));
  const { text, numpages: pages, info } = parsed;
  return { text, pages, info };
}

/**
 * Send one translation request and return the translated string.
 *
 * @param {string} text Text to translate (sent as the `q` field).
 * @param {string} target Target language code.
 * @param {string} [source='auto'] Source language code.
 * @returns {Promise<string>} `translatedText` of the first entry in the
 *   response's `data.translations` array.
 */
async function translateText(text, target, source = 'auto') {
  const payload = { q: text, target, source };
  const requestOptions = {
    headers: {
      Authorization: `Bearer ${API_KEY}`,
      'Content-Type': 'application/json',
    },
    timeout: 30000, // milliseconds
  };

  const { data: body } = await axios.post(API_URL, payload, requestOptions);
  const firstTranslation = body.data.translations[0];
  return firstTranslation.translatedText;
}

/**
 * Split text into chunks of at most `maxChars` characters.
 *
 * Split points are chosen, in order of preference, at the last paragraph
 * break ('\n\n'), sentence end ('. ') or space that starts inside the first
 * `maxChars` characters. If none exists, the text is cut hard at exactly
 * `maxChars` characters.
 *
 * @param {string} text Text to split.
 * @param {number} [maxChars=4500] Maximum chunk length; must be an integer >= 1.
 * @returns {string[]} Trimmed chunks. Text that already fits is returned
 *   unchanged as a single-element array.
 * @throws {RangeError} If `maxChars` is not an integer >= 1.
 */
function chunkText(text, maxChars = 4500) {
  if (!Number.isInteger(maxChars) || maxChars < 1) {
    throw new RangeError('maxChars must be an integer >= 1');
  }
  if (text.length <= maxChars) return [text];

  const separators = ['\n\n', '. ', ' '];
  const chunks = [];
  let remaining = text;

  while (remaining.length > maxChars) {
    // Hard cut by default: exactly maxChars characters, never one more.
    let cut = maxChars;

    for (const separator of separators) {
      // lastIndexOf's second argument is the highest index at which a match
      // may START, so it is capped at maxChars - 1 to keep the resulting
      // chunk (which includes that character) within the limit.
      const splitPos = remaining.lastIndexOf(separator, maxChars - 1);
      if (splitPos !== -1) {
        cut = splitPos + 1;
        break;
      }
    }

    const piece = remaining.slice(0, cut).trim();
    if (piece) chunks.push(piece); // leading whitespace must not yield an empty chunk
    remaining = remaining.slice(cut).trim();
  }

  if (remaining) chunks.push(remaining);
  return chunks;
}

/**
 * Extract a PDF's text, translate it chunk by chunk, and optionally save it.
 *
 * Chunks are translated one after another and joined with single spaces.
 * Progress and an estimated cost ($3.50 per million source characters) are
 * logged to the console.
 *
 * @param {string} pdfPath Path of the PDF file.
 * @param {string} targetLang Target language code.
 * @param {string} [outputPath] When given, the translation is written here as UTF-8.
 * @returns {Promise<string>} The translated text.
 */
async function translatePdf(pdfPath, targetLang, outputPath) {
  console.log(`Extracting text from ${pdfPath}...`);
  const { text, pages } = await extractPdfText(pdfPath);

  console.log(`Extracted ${pages} pages, ${text.length.toLocaleString()} chars`);
  console.log(`Translating to '${targetLang}'...`);

  const chunks = chunkText(text);
  const translatedChunks = [];

  // Sequential on purpose: each request starts only after the previous one.
  for (const [i, chunk] of chunks.entries()) {
    console.log(`  Chunk ${i + 1}/${chunks.length} (${chunks[i].length} chars)...`);
    translatedChunks.push(await translateText(chunk, targetLang));
  }

  const translatedText = translatedChunks.join(' ');

  const cost = (text.length / 1_000_000 * 3.50).toFixed(4);
  console.log(`Done! Estimated cost: $${cost}`);

  if (!outputPath) return translatedText;

  fs.writeFileSync(outputPath, translatedText, 'utf8');
  console.log(`Saved to ${outputPath}`);
  return translatedText;
}

// Usage: translate document.pdf to Spanish and save the result. The .catch
// logs any rejection instead of leaving it unhandled.
translatePdf('document.pdf', 'es', 'document_es.txt')
  .catch(console.error);

Multi-Language Batch Translation

Enterprise workflows often need the same PDF translated into multiple languages simultaneously. Here is how to translate a single PDF into 5 languages in parallel:

/**
 * Translate one PDF into several languages and save one text file per language.
 *
 * Languages are processed in parallel, but each language sends its chunks
 * one at a time, so at most `targetLanguages.length` requests are in flight.
 * Every language that succeeds is saved as `<pdf base name>_<lang>.txt` in
 * the current directory, even when another language fails.
 *
 * @param {string} pdfPath Path of the PDF file.
 * @param {string[]} targetLanguages Target language codes.
 * @returns {Promise<void>}
 * @throws {Error} After saving the successful languages, if any language failed.
 */
async function translatePdfMultiLanguage(pdfPath, targetLanguages) {
  const { text } = await extractPdfText(pdfPath);
  const chunks = chunkText(text);

  console.log(`Translating ${chunks.length} chunks into ${targetLanguages.length} languages...`);

  // allSettled (instead of all) keeps finished languages when one rejects.
  const outcomes = await Promise.allSettled(
    targetLanguages.map(async (lang) => {
      const translatedChunks = [];
      // Sequential within a language: avoids firing
      // chunks.length * targetLanguages.length requests at once.
      for (const chunk of chunks) {
        translatedChunks.push(await translateText(chunk, lang));
      }
      return { lang, text: translatedChunks.join(' ') };
    })
  );

  // Save each successful language to a file
  const baseName = path.basename(pdfPath, '.pdf');
  const failedLanguages = [];

  outcomes.forEach((outcome, index) => {
    const lang = targetLanguages[index];

    if (outcome.status === 'rejected') {
      failedLanguages.push(lang);
      console.error(`Failed: ${lang}:`, outcome.reason);
      return;
    }

    const outputPath = `${baseName}_${lang}.txt`;
    fs.writeFileSync(outputPath, outcome.value.text, 'utf8');
    console.log(`Saved: ${outputPath}`);
  });

  const totalCost = (text.length * targetLanguages.length / 1_000_000 * 3.50).toFixed(4);
  console.log(`Total estimated cost for all languages: $${totalCost}`);

  // Still reject, so callers that handle failures keep being notified.
  if (failedLanguages.length > 0) {
    throw new Error(`Translation failed for: ${failedLanguages.join(', ')}`);
  }
}

// Translate to 5 languages at once. The .catch logs a failed run instead of
// leaving the promise rejection unhandled.
translatePdfMultiLanguage('product_manual.pdf', ['es', 'fr', 'de', 'ja', 'zh'])
  .catch(console.error);

Cost Estimation for PDF Translation

Before running a large translation job, it is useful to estimate the cost. Here is a breakdown based on typical document sizes:

Document Type Avg Pages Avg Characters SocketsIO Cost Google Cost DeepL Cost
1-page contract 1 2,000 $0.007 $0.040 $0.050
Standard PDF 5 12,000 $0.042 $0.240 $0.300
Technical manual 50 120,000 $0.42 $2.40 $3.00
Legal document 200 500,000 $1.75 $10.00 $12.50
100 PDFs/month 5 each 1.2M chars $4.20 $24.00 $30.00

The average 5-page PDF contains approximately 12,000 characters. At SocketsIO's rate of $3.50/million characters, that is just $0.042 per PDF — compared to $0.24 with Google Translate or $0.30 with DeepL.

CLI Tool for PDF Translation

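Here is a command-line tool that wraps the translation logic into a reusable script. It calls translate_pdf_robust, so place it in the same file as the translate_text, chunk_text, translate_large_page, and translate_pdf_robust functions from the sections above: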

#!/usr/bin/env python3
# translate_pdf.py — CLI tool for PDF translation
# Usage: python translate_pdf.py input.pdf --target es --output output_es.txt

import argparse
import sys
import fitz
import requests
from pathlib import Path

# Placeholder credential: replace with your own key before running the tool.
API_KEY = 'YOUR_SOCKETSIO_API_KEY'
# Translation endpoint of the SocketsIO API.
API_URL = 'https://api.socketsio.com/v1/translate'

# Language codes and names printed by the --languages option. This table is
# only used for display; --target is not validated against it.
SUPPORTED_LANGUAGES = {
    'es': 'Spanish',
    'fr': 'French',
    'de': 'German',
    'ja': 'Japanese',
    'zh': 'Chinese',
    'ko': 'Korean',
    'pt': 'Portuguese',
    'ar': 'Arabic',
    'it': 'Italian',
    'ru': 'Russian',
    'nl': 'Dutch',
    'sv': 'Swedish',
}

def main():
    """Parse the command line and translate one PDF into a text file.

    ``--languages`` can be used on its own to print the language table.
    Otherwise both the input file and ``--target`` are required.

    Exit status: 0 after ``--languages``, 1 when the input file does not
    exist, 2 (raised by argparse) on invalid usage.
    """
    parser = argparse.ArgumentParser(description='Translate PDF documents using SocketsIO API')
    # 'input' and '--target' are optional for argparse and checked manually
    # below; otherwise `--languages` alone would be rejected as incomplete.
    parser.add_argument('input', nargs='?', help='Input PDF file path')
    parser.add_argument('--target', '-t', help='Target language code (e.g., es, fr, ja)')
    parser.add_argument('--source', '-s', default='auto', help='Source language code (default: auto-detect)')
    parser.add_argument('--output', '-o', help='Output file path (default: input_LANG.txt)')
    parser.add_argument('--languages', '-l', action='store_true', help='List supported languages')
    args = parser.parse_args()

    if args.languages:
        print("Supported languages:")
        for code, name in SUPPORTED_LANGUAGES.items():
            print(f"  {code}: {name}")
        sys.exit(0)

    # parser.error() prints the usage line and exits with status 2.
    if args.input is None:
        parser.error('the following arguments are required: input')
    if not args.target:
        parser.error('the following arguments are required: --target/-t')

    input_path = Path(args.input)
    if not input_path.is_file():  # also rejects directories
        print(f"Error: File not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    if args.source != 'auto':
        # NOTE(review): translate_pdf_robust is called without a source
        # language, so --source has no effect yet; say so instead of
        # silently ignoring it.
        print(
            "Warning: --source is not forwarded to the translator; "
            "the source language is auto-detected.",
            file=sys.stderr,
        )

    output_path = args.output or f"{input_path.stem}_{args.target}.txt"

    translate_pdf_robust(str(input_path), args.target, output_path)
    print(f"\nTranslation complete: {output_path}")


if __name__ == '__main__':
    main()

Usage examples:

# Translate to Spanish
python translate_pdf.py contract.pdf --target es

# Translate to Japanese with explicit source
python translate_pdf.py manual.pdf --target ja --source en --output manual_japanese.txt

# List supported languages
python translate_pdf.py --languages

Start Translating PDFs for Free

500,000 characters/month free — enough for ~40 standard PDFs. No credit card required. 195 languages. $3.50/M chars pay-as-you-go.

Get Your Free API Key →

Or test instantly in the API Playground — no signup needed