Skip to main content

Overview

OCR (Optical Character Recognition) engines extract text from images or text blocks. BallonTranslator supports multiple OCR engines for different languages and use cases.

OCRBase Class

Base class for all OCR modules.

Import

from modules.ocr.base import OCRBase, OCR, register_OCR

Class Definition

class OCRBase(BaseModule):
    """
    Base class for OCR modules.
    
    Supports two modes:
    1. Full image OCR: Returns text string
    2. Block-based OCR: Extracts text from TextBlock regions
    """
    
    # Class-level hook registries; OrderedDict keeps hooks running in
    # registration order. Subclasses inherit these unless they override them.
    _preprocess_hooks = OrderedDict()
    _postprocess_hooks = OrderedDict()
    _line_only = False  # Whether OCR only handles single lines

Methods

run_ocr

run_ocr
method
Run OCR on an image or on a list of text blocks.

Parameters:
  • img (np.ndarray): Input image (RGB or RGBA)
  • blk_list (List[TextBlock] | TextBlock | None): Text blocks to OCR
  • *args, **kwargs: Additional arguments
Returns:
  • If blk_list is None: Returns extracted text as string
  • If blk_list is provided: Returns list of TextBlocks with text attribute set
ocr = OCR['manga_ocr']()

# Full image OCR: no blocks supplied, so the extracted text string is returned
text = ocr.run_ocr(img)

# Block-based OCR: fills blk.text on each TextBlock and returns the list
blocks_with_text = ocr.run_ocr(img, text_blocks)
for blk in blocks_with_text:
    print(f"Text: {blk.text}")
The run_ocr method automatically:
  • Loads the model if not already loaded
  • Converts RGBA images to RGB
  • Runs postprocessing hooks

_ocr_blk_list (Override Required)

_ocr_blk_list
method
Internal method to OCR a list of text blocks. Must be implemented by subclasses.

Parameters:
  • img (np.ndarray): Input image
  • blk_list (List[TextBlock]): Text blocks
  • *args, **kwargs: Additional arguments
Side Effects:
  • Sets blk.text attribute on each TextBlock
def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock], *args, **kwargs):
    """OCR each text block in place, writing the result to blk.text.

    Blocks whose coordinates do not fit inside the image get an empty string.
    """
    height, width = img.shape[:2]
    for block in blk_list:
        left, top, right, bottom = block.xyxy
        in_bounds = 0 <= left < right <= width and 0 <= top < bottom <= height
        block.text = self.recognize(img[top:bottom, left:right]) if in_bounds else ""

ocr_img (Override Required)

ocr_img
method
OCR a full image. Must be implemented by subclasses.

Parameters:
  • img (np.ndarray): Input image
Returns: str - Extracted text
def ocr_img(self, img: np.ndarray) -> str:
    """Run the OCR model over the entire image and return the extracted text."""
    return self.model(img).text

Properties

name
str
OCR engine name from registry. Automatically set during initialization.
_line_only
bool
Whether this OCR engine only handles single-line text.

Available OCR Engines

Manga OCR

Specialized OCR for Japanese manga text.
from modules.ocr.ocr_manga import MangaOCR

ocr = MangaOCR(device='cuda')  # use the GPU; pass 'cpu' when CUDA is unavailable
text = ocr.run_ocr(img, text_blocks)

Parameters

device
str
Computation device. Options: cpu, cuda, mps, xpu. Default: Auto-detected

Features

  • Optimized for Japanese manga text
  • Handles vertical and horizontal text
  • Based on Vision Encoder-Decoder architecture
  • Removes spaces and normalizes output

Model Files

Downloads from HuggingFace: kha-white/manga-ocr-base

PaddleOCR

Multilingual OCR supporting 80+ languages.
from modules.ocr.ocr_paddle import PaddleOCRModule

# Multilingual OCR engine; see the parameter table for all available options.
ocr = PaddleOCRModule(
    language='English',
    device='cuda',
    ocr_version='PP-OCRv4'
)

Parameters

language
str
OCR language. Options: Chinese & English, English, French, German, Japanese, Korean, Chinese Traditional, Italian, Spanish, Portuguese, Russian, Ukrainian, Arabic, Hindi, and 80+ more. Default: English
device
str
Computation device. Default: Auto-detected
use_angle_cls
bool
Enable angle classification for rotated text. Default: False
ocr_version
str
PaddleOCR model version. Options: PP-OCRv4, PP-OCRv3, PP-OCRv2, PP-OCR. Default: PP-OCRv4
enable_mkldnn
bool
Enable MKL-DNN for CPU acceleration. Default: False
det_limit_side_len
int
Maximum side length for text detection. Default: 960
rec_batch_num
int
Batch size for text recognition. Default: 6
drop_score
float
Confidence threshold for text recognition. Default: 0.5
text_case
str
Text case transformation. Options: Uppercase, Capitalize Sentences, Lowercase. Default: Capitalize Sentences
output_format
str
Text output format. Options: Single Line, As Recognized. Default: As Recognized

Features

  • 80+ language support
  • Multiple OCR versions (v2-v4)
  • CPU and GPU acceleration
  • Text case normalization
  • Angle classification for rotated text

Example Usage

Basic OCR

import cv2
from modules.base import init_ocr_registries
from modules.ocr.base import OCR

# Initialize (registers all available OCR engines into the OCR registry)
init_ocr_registries()

# Load image (OpenCV loads BGR; the OCR modules expect RGB)
img = cv2.imread('page.jpg')
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Get OCR engine from the registry by name
ocr = OCR['manga_ocr'](device='cuda')

# Full image OCR (no blocks given, so a plain string is returned)
text = ocr.run_ocr(img)
print(f"Extracted text: {text}")

OCR with Text Blocks

from modules.textdetector.base import TEXTDETECTORS
from modules.ocr.base import OCR

# Detect text regions (returns a mask plus a list of TextBlock boxes)
detector = TEXTDETECTORS['ctd']()
mask, text_blocks = detector.detect(img)

# Run OCR on detected blocks (sets blk.text on each block)
ocr = OCR['manga_ocr']()
text_blocks = ocr.run_ocr(img, text_blocks)

# Print results
for i, blk in enumerate(text_blocks):
    print(f"Block {i}: {blk.text}")

Multilingual OCR

# Japanese manga (specialized engine)
ocr_ja = OCR['manga_ocr']()

# English text via the multilingual PaddleOCR engine
ocr_en = OCR['paddle_ocr'](
    language='English',
    text_case='Capitalize Sentences'
)

# Korean text
ocr_ko = OCR['paddle_ocr'](
    language='Korean'
)

# Detect language and use appropriate OCR
def ocr_auto(img, text_blocks, lang='en'):
    """OCR *text_blocks* with the engine best suited for *lang*.

    The original version eagerly instantiated all three OCR engines on
    every call even though only one was used; here construction is
    deferred so only the selected engine is built.

    Parameters:
        img: input image (np.ndarray, RGB).
        text_blocks: detected TextBlock regions to recognize.
        lang: language code ('ja', 'en', 'ko'); anything else falls back
              to a default PaddleOCR instance.

    Returns:
        The text blocks with their ``text`` attribute filled in.
    """
    # Factories defer construction until we know which engine is needed.
    factories = {
        'ja': lambda: OCR['manga_ocr'](),
        'en': lambda: OCR['paddle_ocr'](language='English'),
        'ko': lambda: OCR['paddle_ocr'](language='Korean'),
    }
    make_ocr = factories.get(lang, lambda: OCR['paddle_ocr']())
    return make_ocr().run_ocr(img, text_blocks)

Creating Custom OCR Engines

Basic Template

import numpy as np
from typing import List
from modules.ocr.base import OCRBase, register_OCR, TextBlock, DEVICE_SELECTOR

@register_OCR('my_ocr')
class MyOCR(OCRBase):
    """Template for a custom OCR engine registered as 'my_ocr'."""

    params = {
        'device': DEVICE_SELECTOR(),
        'confidence': 0.5,
        'description': 'My custom OCR'
    }

    _load_model_keys = {'model'}

    def __init__(self, **params):
        super().__init__(**params)
        self.model = None  # populated lazily by _load_model

    def _load_model(self):
        """Load the OCR model onto the configured device."""
        self.model = load_my_ocr_model(device=self.get_param_value('device'))

    def ocr_img(self, img: np.ndarray) -> str:
        """OCR the full image and return the recognized text."""
        return self.model.recognize(img).text

    def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock], *args, **kwargs):
        """OCR each text block, writing the result to blk.text."""
        im_h, im_w = img.shape[:2]

        for blk in blk_list:
            # Default to empty text; overwritten only on a confident result.
            blk.text = ""
            x1, y1, x2, y2 = blk.xyxy

            # Skip blocks whose coordinates do not fit inside the image.
            if not (0 <= x1 < x2 <= im_w and 0 <= y1 < y2 <= im_h):
                self.logger.warning('Invalid text block coordinates')
                continue

            try:
                result = self.model.recognize(img[y1:y2, x1:x2])
                # Keep the text only when the model is confident enough.
                if result.confidence >= self.get_param_value('confidence'):
                    blk.text = result.text
            except Exception as e:
                self.logger.error(f"OCR failed: {e}")

Advanced: With Postprocessing

@register_OCR('advanced_ocr')
class AdvancedOCR(OCRBase):
    """Template OCR engine that normalizes raw model output."""

    def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock], *args, **kwargs):
        """Recognize every block, then clean up the raw text before storing it."""
        for blk in blk_list:
            x1, y1, x2, y2 = blk.xyxy
            raw = self.model.recognize(img[y1:y2, x1:x2])
            blk.text = self.postprocess_text(raw)

    def postprocess_text(self, text: str) -> str:
        """Collapse whitespace, drop stray hyphens, and capitalize sentences."""
        import re

        collapsed = re.sub(r'\s+', ' ', text)
        # Drop hyphens not followed by a word character (typically left
        # over from line-break hyphenation).
        dehyphenated = re.sub(r'-(?!\w)', '', collapsed)
        sentences = [part.capitalize() for part in dehyphenated.split('. ')]
        return '. '.join(sentences).strip()

Postprocessing Hooks

Registering Hooks

def cleanup_hook(textblocks: List[TextBlock], img: np.ndarray, ocr_module):
    """Postprocess hook: strip noise characters and surrounding whitespace."""
    for block in textblocks:
        # Drop '~' and '*' noise characters, then trim the remainder.
        cleaned = block.text.replace('~', '').replace('*', '')
        block.text = cleaned.strip()

# Register globally so every OCRBase subclass runs the hook
OCRBase.register_postprocess_hooks(cleanup_hook)

# Or register only for one specific OCR engine class
MangaOCR.register_postprocess_hooks(cleanup_hook)

Best Practices

1. Language Selection

# Use specialized OCR engines for better results on matching content
ocr_manga = OCR['manga_ocr']()  # For Japanese manga
ocr_paddle_en = OCR['paddle_ocr'](language='English')  # For English

# Not: Using generic OCR for specialized text

2. Preprocessing Images

import cv2

def preprocess_for_ocr(img):
    """Enhance an RGB image for better OCR results.

    Pipeline: grayscale -> histogram equalization -> denoising -> back to RGB.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    equalized = cv2.equalizeHist(grayscale)
    cleaned = cv2.fastNlMeansDenoising(equalized)
    return cv2.cvtColor(cleaned, cv2.COLOR_GRAY2RGB)

# Run OCR on the enhanced copy instead of the raw image
img_enhanced = preprocess_for_ocr(img)
text = ocr.run_ocr(img_enhanced, text_blocks)

3. Batch Processing

# Process multiple images efficiently
# (larger rec_batch_num recognizes more text lines per model pass)
ocr = OCR['paddle_ocr'](language='English', rec_batch_num=8)

for img_path in image_paths:
    img = cv2.imread(img_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; convert to RGB
    
    # OCR
    text_blocks = ocr.run_ocr(img, text_blocks)
    
    # Save results
    save_results(text_blocks)

ocr.unload_model(empty_cache=True)  # release model memory (and cache) when finished

4. Error Handling

try:
    ocr = OCR['manga_ocr'](device='cuda')
    text = ocr.run_ocr(img, text_blocks)
except KeyError:
    # Raised by the registry lookup when the engine name is unknown
    print("OCR engine not found")
except RuntimeError as e:
    print(f"OCR failed: {e}")
    # Fallback to CPU (e.g. when CUDA is unavailable or runs out of memory)
    ocr = OCR['manga_ocr'](device='cpu')
    text = ocr.run_ocr(img, text_blocks)

5. Parameter Tuning

# For better accuracy (PaddleOCR)
ocr = OCR['paddle_ocr'](
    language='English',
    det_limit_side_len=1280,  # Larger detection size preserves small text
    drop_score=0.3,           # Lower threshold keeps more (less confident) results
    rec_batch_num=4           # Smaller batch for stability
)

# For speed (PaddleOCR)
ocr = OCR['paddle_ocr'](
    language='English',
    det_limit_side_len=640,   # Smaller size -> faster detection
    drop_score=0.7,           # Higher threshold discards low-confidence text
    rec_batch_num=16,         # Larger batch -> more throughput
    enable_mkldnn=True        # CPU acceleration via MKL-DNN
)

Registry Usage

Listing Available OCR Engines

from modules.base import init_ocr_registries
from modules.ocr.base import OCR

# Populate the OCR registry before inspecting it
init_ocr_registries()

print("Available OCR engines:")
for name, ocr_class in OCR.module_dict.items():
    print(f"  - {name}: {ocr_class}")

Dynamic OCR Selection

def get_ocr(language: str):
    """Return the OCR engine best suited for *language*."""
    if language == 'ja':
        # Japanese gets the dedicated manga engine.
        return OCR['manga_ocr']()
    # Everything else goes through PaddleOCR's multilingual models.
    return OCR['paddle_ocr'](language=language.capitalize())

# Usage: 'ja' selects the specialized manga OCR engine
ocr = get_ocr('ja')