Overview
Text detectors identify text regions in images and return a binary mask of those regions together with a list of text blocks, each carrying its bounding box. All detectors inherit from TextDetectorBase and are managed through the TEXTDETECTORS registry.
TextDetectorBase Class
Base class for all text detection modules.
Import
from modules.textdetector.base import TextDetectorBase, TEXTDETECTORS, register_textdetectors
Class Definition
class TextDetectorBase(BaseModule):
"""
Base class for text detection modules.
Detects text regions in images and returns:
- Binary mask of text regions
- List of TextBlock objects with bounding boxes
"""
_preprocess_hooks = OrderedDict()
_postprocess_hooks = OrderedDict()
Methods
detect
Detect text in an image.
Parameters:
img (np.ndarray): Input image (RGB or RGBA)
proj (ProjImgTrans | None): Projection context (optional)
Returns:
mask (np.ndarray): Binary mask of detected text regions
blk_list (List[TextBlock]): List of detected text blocks
detector = TEXTDETECTORS['ctd']()
mask, text_blocks = detector.detect(img)
for blk in text_blocks:
x1, y1, x2, y2 = blk.xyxy
print(f"Text at ({x1}, {y1}) to ({x2}, {y2})")
The detect method automatically:
- Loads the model if not already loaded
- Converts RGBA images to RGB
- Sets the det_model attribute on each TextBlock
_detect (Override Required)
Internal detection method. Must be implemented by subclasses.
Parameters:
img (np.ndarray): Input image (RGB)
proj (ProjImgTrans): Projection context
Returns:
mask (np.ndarray): Binary mask
blk_list (List[TextBlock]): Text blocks
def _detect(self, img: np.ndarray, proj: ProjImgTrans) -> Tuple[np.ndarray, List[TextBlock]]:
# Your detection logic
mask = np.zeros(img.shape[:2], dtype=np.uint8)
blk_list = []
# ... detection code ...
return mask, blk_list
setup_detector (Override Required)
Initialize the detector. Override this for detector-specific setup.
def setup_detector(self):
# Initialize detector components
pass
Properties
Detector name from registry. Automatically set during initialization.
TextBlock Structure
Detected text blocks are represented by TextBlock objects.
from utils.textblock import TextBlock
# TextBlock attributes (set by detector):
blk.xyxy # [x1, y1, x2, y2] bounding box
blk.det_model # Name of detector that found this block
blk.font_size # Estimated font size (if available)
blk._detected_font_size # Original detected font size
Available Detectors
Comic Text Detector (ctd)
High-performance detector optimized for manga/comics.
from modules.textdetector.detector_ctd import ComicTextDetector
detector = ComicTextDetector(
detect_size=1280,
device='cuda'
)
mask, blocks = detector.detect(image)
Parameters
detect_size
Maximum detection size. Larger values improve accuracy but use more memory. Options: 896, 1024, 1152, 1280. Default: 1280
det_rearrange_max_batches
Maximum batches for detection rearrangement. Options: 1, 2, 4, 6, 8, 12, 16, 24, 32. Default: 4
device
Computation device. Options: cpu, cuda, mps, xpu. Default: Auto-detected
font size multiplier
Multiplier for detected font sizes. Default: 1.0
font size max
Maximum font size limit (-1 for no limit). Default: -1
font size min
Minimum font size limit (-1 for no limit). Default: -1
mask dilate size
Kernel size for mask dilation. Default: 2
Model Files
The following model files are downloaded automatically:
comictextdetector.pt (PyTorch, for GPU)
comictextdetector.pt.onnx (ONNX, for CPU)
Example: Using CTD
import cv2
from modules.base import init_textdetector_registries
from modules.textdetector.base import TEXTDETECTORS
# Initialize registries
init_textdetector_registries()
# Get detector
detector = TEXTDETECTORS['ctd'](
detect_size=1280,
device='cuda',
**{'font size multiplier': 1.2}
)
# Load image
img = cv2.imread('manga_page.jpg')
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# Detect text
mask, text_blocks = detector.detect(img)
print(f"Found {len(text_blocks)} text regions")
for i, blk in enumerate(text_blocks):
x1, y1, x2, y2 = blk.xyxy
print(f"Block {i}: ({x1}, {y1}) to ({x2}, {y2})")
print(f" Font size: {blk.font_size}")
print(f" Detector: {blk.det_model}")
# Visualize mask
cv2.imshow('Text Mask', mask)
cv2.waitKey(0)
Creating Custom Detectors
Basic Template
import numpy as np
from typing import Tuple, List
from modules.textdetector.base import (
TextDetectorBase,
register_textdetectors,
TextBlock,
DEVICE_SELECTOR
)
@register_textdetectors('my_detector')
class MyDetector(TextDetectorBase):
params = {
'device': DEVICE_SELECTOR(),
'threshold': 0.5,
'description': 'My custom detector'
}
_load_model_keys = {'model'}
def __init__(self, **params):
super().__init__(**params)
self.model = None
def setup_detector(self):
"""Initialize detector components."""
self.threshold = self.get_param_value('threshold')
def _load_model(self):
"""Load detection model."""
# Load your model
self.model = load_my_model()
def _detect(self, img: np.ndarray, proj) -> Tuple[np.ndarray, List[TextBlock]]:
"""Detect text regions."""
# Run detection
predictions = self.model.predict(img)
# Create mask
mask = np.zeros(img.shape[:2], dtype=np.uint8)
# Create text blocks
blk_list = []
for pred in predictions:
if pred.score > self.threshold:
blk = TextBlock()
blk.xyxy = pred.bbox # [x1, y1, x2, y2]
blk.font_size = pred.font_size
blk_list.append(blk)
# Update mask
x1, y1, x2, y2 = map(int, pred.bbox)
mask[y1:y2, x1:x2] = 255
return mask, blk_list
Advanced: With Font Size Detection
from modules.textdetector.detector_ctd import ComicTextDetector
@register_textdetectors('advanced_detector')
class AdvancedDetector(TextDetectorBase):
params = {
'device': DEVICE_SELECTOR(),
'font size multiplier': 1.0,
'font size max': -1,
'font size min': -1,
}
def _detect(self, img: np.ndarray, proj) -> Tuple[np.ndarray, List[TextBlock]]:
# Detection logic
mask, blk_list = self.run_detection(img)
# Apply font size adjustments
fnt_rsz = self.get_param_value('font size multiplier')
fnt_max = self.get_param_value('font size max')
fnt_min = self.get_param_value('font size min')
for blk in blk_list:
sz = blk._detected_font_size * fnt_rsz
if fnt_max > 0:
sz = min(fnt_max, sz)
if fnt_min > 0:
sz = max(fnt_min, sz)
blk.font_size = sz
blk._detected_font_size = sz
return mask, blk_list
Registry Usage
Listing Available Detectors
from modules.base import init_textdetector_registries
from modules.textdetector.base import TEXTDETECTORS
# Initialize
init_textdetector_registries()
# List all detectors
print("Available detectors:")
for name, detector_class in TEXTDETECTORS.module_dict.items():
print(f" - {name}: {detector_class}")
Dynamic Detector Loading
def get_detector(detector_name: str, **params):
"""Get detector by name."""
if detector_name not in TEXTDETECTORS:
raise ValueError(f"Unknown detector: {detector_name}")
detector_class = TEXTDETECTORS[detector_name]
return detector_class(**params)
# Usage
detector = get_detector('ctd', device='cuda', detect_size=1280)
Best Practices
1. Memory Management
# Load model only when needed
detector = TEXTDETECTORS['ctd']()
# Model is loaded automatically on first detect() call
# Unload when done
detector.unload_model(empty_cache=True)
2. Batch Processing
import glob
detector = TEXTDETECTORS['ctd'](device='cuda')
for img_path in glob.glob('images/*.jpg'):
img = cv2.imread(img_path)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
mask, blocks = detector.detect(img)
# Process blocks
process_blocks(blocks)
# Clean up
detector.unload_model(empty_cache=True)
3. Device Management
# Auto-detect best device
from modules.base import DEFAULT_DEVICE
detector = TEXTDETECTORS['ctd'](device=DEFAULT_DEVICE)
# Or specify manually
detector = TEXTDETECTORS['ctd'](device='cuda')
# Change device at runtime
detector.updateParam('device', 'cpu')
4. Parameter Tuning
# Start with defaults
detector = TEXTDETECTORS['ctd']()
# Adjust for better accuracy (larger images)
detector.updateParam('detect_size', 1280)
# Adjust font size detection
detector.set_param_value('font size multiplier', 1.2)
detector.set_param_value('font size min', 12)
# Larger mask coverage
detector.set_param_value('mask dilate size', 3)
Error Handling
try:
detector = TEXTDETECTORS['ctd'](device='cuda')
mask, blocks = detector.detect(img)
except KeyError:
print("Detector not found. Did you initialize registries?")
except RuntimeError as e:
print(f"Detection failed: {e}")
# Fall back to CPU
detector = TEXTDETECTORS['ctd'](device='cpu')
mask, blocks = detector.detect(img)