Overview
OCR (Optical Character Recognition) engines extract text from images or text blocks. BallonTranslator supports multiple OCR engines for different languages and use cases.
OCRBase Class
Base class for all OCR modules.
Import
from modules.ocr.base import OCRBase, OCR, register_OCR
Class Definition
class OCRBase(BaseModule):
"""
Base class for OCR modules.
Supports two modes:
1. Full image OCR: Returns text string
2. Block-based OCR: Extracts text from TextBlock regions
"""
_preprocess_hooks = OrderedDict()
_postprocess_hooks = OrderedDict()
_line_only = False # Whether OCR only handles single lines
Methods
run_ocr
Run OCR on an image or a list of text blocks. Parameters:
img (np.ndarray): Input image (RGB or RGBA)
blk_list (List[TextBlock] | TextBlock | None): Text blocks to OCR
*args, **kwargs: Additional arguments
Returns:
- If blk_list is None: returns the extracted text as a string
- If blk_list is provided: returns the list of TextBlocks with their text attribute set
ocr = OCR['manga_ocr']()
# Full image OCR
text = ocr.run_ocr(img)
# Block-based OCR
blocks_with_text = ocr.run_ocr(img, text_blocks)
for blk in blocks_with_text:
print(f"Text: {blk.text}")
The run_ocr method automatically:
- Loads the model if not already loaded
- Converts RGBA images to RGB
- Runs postprocessing hooks
_ocr_blk_list (Override Required)
Internal method to OCR a list of text blocks. Must be implemented by subclasses. Parameters:
img (np.ndarray): Input image
blk_list (List[TextBlock]): Text blocks
*args, **kwargs: Additional arguments
Side Effects:
- Sets
blk.text attribute on each TextBlock
def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock], *args, **kwargs):
im_h, im_w = img.shape[:2]
for blk in blk_list:
x1, y1, x2, y2 = blk.xyxy
if 0 <= x1 < x2 <= im_w and 0 <= y1 < y2 <= im_h:
region = img[y1:y2, x1:x2]
blk.text = self.recognize(region)
else:
blk.text = ""
ocr_img (Override Required)
OCR a full image. Must be implemented by subclasses. Parameters:
img (np.ndarray): Input image
Returns: str - Extracted text
def ocr_img(self, img: np.ndarray) -> str:
    result = self.model(img)
    return result.text
Properties
OCR engine name from registry. Automatically set during initialization.
Whether this OCR engine only handles single-line text.
Available OCR Engines
Manga OCR
Specialized OCR for Japanese manga text.
from modules.ocr.ocr_manga import MangaOCR
ocr = MangaOCR(device='cuda')
text = ocr.run_ocr(img, text_blocks)
Parameters
Computation device. Options: cpu, cuda, mps, xpu. Default: auto-detected
Features
- Optimized for Japanese manga text
- Handles vertical and horizontal text
- Based on Vision Encoder-Decoder architecture
- Removes spaces and normalizes output
Model Files
Downloads from HuggingFace: kha-white/manga-ocr-base
PaddleOCR
Multilingual OCR supporting 80+ languages.
from modules.ocr.ocr_paddle import PaddleOCRModule
ocr = PaddleOCRModule(
language='English',
device='cuda',
ocr_version='PP-OCRv4'
)
Parameters
OCR language. Options: Chinese & English, English, French, German, Japanese, Korean, Chinese Traditional, Italian, Spanish, Portuguese, Russian, Ukrainian, Arabic, Hindi, and 80+ more. Default: English
Computation device. Default: auto-detected
Enable angle classification for rotated text. Default: False
PaddleOCR model version. Options: PP-OCRv4, PP-OCRv3, PP-OCRv2, PP-OCR. Default: PP-OCRv4
Enable MKL-DNN for CPU acceleration. Default: False
Maximum side length for text detection. Default: 960
Batch size for text recognition. Default: 6
Confidence threshold for text recognition. Default: 0.5
Text case transformation. Options: Uppercase, Capitalize Sentences, Lowercase. Default: Capitalize Sentences
Text output format. Options: Single Line, As Recognized. Default: As Recognized
Features
- 80+ language support
- Multiple OCR versions (v2-v4)
- CPU and GPU acceleration
- Text case normalization
- Angle classification for rotated text
Example Usage
Basic OCR
import cv2
from modules.base import init_ocr_registries
from modules.ocr.base import OCR
# Initialize
init_ocr_registries()
# Load image
img = cv2.imread('page.jpg')
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# Get OCR engine
ocr = OCR['manga_ocr'](device='cuda')
# Full image OCR
text = ocr.run_ocr(img)
print(f"Extracted text: {text}")
OCR with Text Blocks
from modules.textdetector.base import TEXTDETECTORS
from modules.ocr.base import OCR
# Detect text regions
detector = TEXTDETECTORS['ctd']()
mask, text_blocks = detector.detect(img)
# Run OCR on detected blocks
ocr = OCR['manga_ocr']()
text_blocks = ocr.run_ocr(img, text_blocks)
# Print results
for i, blk in enumerate(text_blocks):
print(f"Block {i}: {blk.text}")
Multilingual OCR
# Japanese manga
ocr_ja = OCR['manga_ocr']()
# English text
ocr_en = OCR['paddle_ocr'](
language='English',
text_case='Capitalize Sentences'
)
# Korean text
ocr_ko = OCR['paddle_ocr'](
language='Korean'
)
# Detect language and use appropriate OCR
def ocr_auto(img, text_blocks, lang='en'):
    """Run OCR on *text_blocks* with an engine appropriate for *lang*.

    Parameters:
        img: input image (np.ndarray, RGB)
        text_blocks: detected TextBlock list to recognize
        lang: language code ('ja', 'en', 'ko'); anything else falls back
              to the default PaddleOCR engine

    Returns: the TextBlock list with text attributes set.
    """
    # Use factories so only the selected engine is constructed; the
    # original built every engine (including the fallback) on each call.
    factories = {
        'ja': lambda: OCR['manga_ocr'](),
        'en': lambda: OCR['paddle_ocr'](language='English'),
        'ko': lambda: OCR['paddle_ocr'](language='Korean'),
    }
    make_ocr = factories.get(lang, lambda: OCR['paddle_ocr']())
    return make_ocr().run_ocr(img, text_blocks)
Creating Custom OCR Engines
Basic Template
import numpy as np
from typing import List
from modules.ocr.base import OCRBase, register_OCR, TextBlock, DEVICE_SELECTOR
@register_OCR('my_ocr')
class MyOCR(OCRBase):
    """Example custom OCR engine template.

    Demonstrates the three hooks a subclass must provide: model loading,
    full-image OCR, and block-list OCR.
    """
    params = {
        'device': DEVICE_SELECTOR(),
        'confidence': 0.5,
        'description': 'My custom OCR'
    }
    _load_model_keys = {'model'}

    def __init__(self, **params):
        super().__init__(**params)
        self.model = None

    def _load_model(self):
        """Load the OCR model onto the configured device."""
        device = self.get_param_value('device')
        self.model = load_my_ocr_model(device=device)

    def ocr_img(self, img: np.ndarray) -> str:
        """OCR a full image and return the extracted text."""
        result = self.model.recognize(img)
        return result.text

    def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock], *args, **kwargs):
        """OCR each text block region; sets blk.text as a side effect.

        Blocks with invalid coordinates or low-confidence results get "".
        """
        im_h, im_w = img.shape[:2]
        # Hoisted out of the loop: the threshold is invariant per call.
        min_confidence = self.get_param_value('confidence')
        for blk in blk_list:
            x1, y1, x2, y2 = blk.xyxy
            # Guard clause: reject degenerate or out-of-bounds boxes.
            if not (0 <= x1 < x2 <= im_w and 0 <= y1 < y2 <= im_h):
                self.logger.warning('Invalid text block coordinates')
                blk.text = ""
                continue
            region = img[y1:y2, x1:x2]
            try:
                result = self.model.recognize(region)
                # Drop results below the confidence threshold.
                blk.text = result.text if result.confidence >= min_confidence else ""
            except Exception as e:
                self.logger.error(f"OCR failed: {e}")
                blk.text = ""
Advanced: With Postprocessing
@register_OCR('advanced_ocr')
class AdvancedOCR(OCRBase):
    """Example OCR engine with text postprocessing."""

    def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock], *args, **kwargs):
        """OCR each block and clean the output; sets blk.text as a side effect."""
        im_h, im_w = img.shape[:2]
        for blk in blk_list:
            x1, y1, x2, y2 = blk.xyxy
            # Validate coordinates before slicing (consistent with the base
            # template): an out-of-range or empty box would otherwise yield
            # a bogus region and garbage recognition.
            if not (0 <= x1 < x2 <= im_w and 0 <= y1 < y2 <= im_h):
                blk.text = ""
                continue
            region = img[y1:y2, x1:x2]
            raw_text = self.model.recognize(region)
            blk.text = self.postprocess_text(raw_text)

    def postprocess_text(self, text: str) -> str:
        """Clean up OCR output.

        Collapses runs of whitespace, removes line-break hyphens, and
        capitalizes the first letter of each sentence.
        """
        import re
        # Collapse any whitespace run to a single space
        text = re.sub(r'\s+', ' ', text)
        # Remove hyphens not followed by a word character (line-break hyphens)
        text = re.sub(r'-(?!\w)', '', text)
        # NOTE(review): str.capitalize() lowercases the rest of the sentence,
        # which can mangle acronyms/proper nouns — acceptable for an example.
        text = '. '.join(s.capitalize() for s in text.split('. '))
        return text.strip()
Postprocessing Hooks
Registering Hooks
def cleanup_hook(textblocks: List[TextBlock], img: np.ndarray, ocr_module):
    """Postprocess hook: strip noise characters ('~', '*') and whitespace."""
    # One translation table pass removes both noise characters at once.
    noise_table = str.maketrans('', '', '~*')
    for block in textblocks:
        block.text = block.text.translate(noise_table).strip()
# Register globally for all instances
OCRBase.register_postprocess_hooks(cleanup_hook)
# Or register for specific OCR
MangaOCR.register_postprocess_hooks(cleanup_hook)
Best Practices
1. Language Selection
# Use specialized OCR for better results
ocr_manga = OCR['manga_ocr']() # For Japanese manga
ocr_paddle_en = OCR['paddle_ocr'](language='English') # For English
# Not: Using generic OCR for specialized text
2. Preprocessing Images
import cv2
def preprocess_for_ocr(img):
    """Enhance an RGB image for OCR: grayscale, boost contrast, denoise, RGB."""
    grayscale = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    # Histogram equalization spreads intensities for better contrast
    contrast_boosted = cv2.equalizeHist(grayscale)
    cleaned = cv2.fastNlMeansDenoising(contrast_boosted)
    # OCR engines expect RGB input, so convert back from grayscale
    return cv2.cvtColor(cleaned, cv2.COLOR_GRAY2RGB)
# Use preprocessed image
img_enhanced = preprocess_for_ocr(img)
text = ocr.run_ocr(img_enhanced, text_blocks)
3. Batch Processing
# Process multiple images efficiently
ocr = OCR['paddle_ocr'](language='English', rec_batch_num=8)
for img_path in image_paths:
img = cv2.imread(img_path)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# OCR
text_blocks = ocr.run_ocr(img, text_blocks)
# Save results
save_results(text_blocks)
ocr.unload_model(empty_cache=True)
4. Error Handling
try:
ocr = OCR['manga_ocr'](device='cuda')
text = ocr.run_ocr(img, text_blocks)
except KeyError:
print("OCR engine not found")
except RuntimeError as e:
print(f"OCR failed: {e}")
# Fallback to CPU
ocr = OCR['manga_ocr'](device='cpu')
text = ocr.run_ocr(img, text_blocks)
5. Parameter Tuning
# For better accuracy (PaddleOCR)
ocr = OCR['paddle_ocr'](
language='English',
det_limit_side_len=1280, # Larger detection size
drop_score=0.3, # Lower threshold for more results
rec_batch_num=4 # Smaller batch for stability
)
# For speed (PaddleOCR)
ocr = OCR['paddle_ocr'](
language='English',
det_limit_side_len=640, # Smaller size
drop_score=0.7, # Higher threshold
rec_batch_num=16, # Larger batch
enable_mkldnn=True # CPU acceleration
)
Registry Usage
Listing Available OCR Engines
from modules.base import init_ocr_registries
from modules.ocr.base import OCR
init_ocr_registries()
print("Available OCR engines:")
for name, ocr_class in OCR.module_dict.items():
print(f" - {name}: {ocr_class}")
Dynamic OCR Selection
def get_ocr(language: str):
    """Return the OCR engine best suited for *language*."""
    # Japanese gets the manga-specialized engine; everything else PaddleOCR.
    return (OCR['manga_ocr']() if language == 'ja'
            else OCR['paddle_ocr'](language=language.capitalize()))
# Usage
ocr = get_ocr('ja')