Skip to main content

Overview

Text detectors identify text regions in images and return bounding boxes with text blocks. All detectors inherit from TextDetectorBase and are managed through the TEXTDETECTORS registry.

TextDetectorBase Class

Base class for all text detection modules.

Import

from modules.textdetector.base import TextDetectorBase, TEXTDETECTORS, register_textdetectors

Class Definition

class TextDetectorBase(BaseModule):
    """
    Base class for text detection modules.
    
    Detects text regions in images and returns:
    - Binary mask of text regions
    - List of TextBlock objects with bounding boxes
    """
    
    # Ordered mappings defined on the class itself, so the same two objects are
    # shared by every instance unless a subclass rebinds them.
    # NOTE(review): presumably these hold callables run before/after detection;
    # confirm against BaseModule.
    _preprocess_hooks = OrderedDict()
    _postprocess_hooks = OrderedDict()

Methods

detect

detect
method
Detect text in an image. Parameters:
  • img (np.ndarray): Input image (RGB or RGBA)
  • proj (ProjImgTrans | None): Projection context (optional)
Returns:
  • mask (np.ndarray): Binary mask of detected text regions
  • blk_list (List[TextBlock]): List of detected text blocks
detector = TEXTDETECTORS['ctd']()
mask, text_blocks = detector.detect(img)

for blk in text_blocks:
    x1, y1, x2, y2 = blk.xyxy
    print(f"Text at ({x1}, {y1}) to ({x2}, {y2})")
The detect method automatically:
  • Loads the model if not already loaded
  • Converts RGBA images to RGB
  • Sets the det_model attribute on each TextBlock

_detect (Override Required)

_detect
method
Internal detection method. Must be implemented by subclasses. Parameters:
  • img (np.ndarray): Input image (RGB)
  • proj (ProjImgTrans): Projection context
Returns:
  • mask (np.ndarray): Binary mask
  • blk_list (List[TextBlock]): Text blocks
def _detect(self, img: np.ndarray, proj: ProjImgTrans) -> Tuple[np.ndarray, List[TextBlock]]:
    """Skeleton for a subclass detection routine.

    As written it returns an all-zero ``uint8`` mask sized to the first two
    dimensions of ``img`` together with an empty block list; real detection
    code belongs where the placeholder comment sits.
    """
    # Your detection logic
    blk_list = []
    mask_shape = img.shape[:2]
    mask = np.zeros(mask_shape, dtype=np.uint8)

    # ... detection code ...

    return mask, blk_list

setup_detector (Override Required)

setup_detector
method
Initialize the detector. Override this for detector-specific setup.
def setup_detector(self):
    """Initialize detector components; this template intentionally does nothing."""
    return None

Properties

name
str
Detector name from registry. Automatically set during initialization.

TextBlock Structure

Detected text blocks are represented by TextBlock objects.
from utils.textblock import TextBlock

# TextBlock attributes (set by detector):
blk.xyxy         # [x1, y1, x2, y2] bounding box
blk.det_model    # Name of detector that found this block
blk.font_size    # Estimated font size (if available)
blk._detected_font_size  # Original detected font size

Available Detectors

Comic Text Detector (ctd)

High-performance detector optimized for manga/comics.
from modules.textdetector.detector_ctd import ComicTextDetector

detector = ComicTextDetector(
    detect_size=1280,
    device='cuda'
)

mask, blocks = detector.detect(image)

Parameters

detect_size
int
Maximum detection size. Larger values improve accuracy but use more memory. Options: 896, 1024, 1152, 1280. Default: 1280
det_rearrange_max_batches
int
Maximum batches for detection rearrangement. Options: 1, 2, 4, 6, 8, 12, 16, 24, 32. Default: 4
device
str
Computation device. Options: cpu, cuda, mps, xpu. Default: auto-detected
font size multiplier
float
Multiplier for detected font sizes. Default: 1.0
font size max
int
Maximum font size limit (-1 for no limit). Default: -1
font size min
int
Minimum font size limit (-1 for no limit). Default: -1
mask dilate size
int
Kernel size for mask dilation. Default: 2

Model Files

The following model files are downloaded automatically:
  • comictextdetector.pt (PyTorch, for GPU)
  • comictextdetector.pt.onnx (ONNX, for CPU)

Example: Using CTD

import cv2
from modules.base import init_textdetector_registries
from modules.textdetector.base import TEXTDETECTORS

# Initialize registries
init_textdetector_registries()

# Get detector
detector = TEXTDETECTORS['ctd'](
    detect_size=1280,
    device='cuda',
    # Parameter names that contain spaces cannot be written as plain keyword
    # arguments, so they are passed through dict unpacking.
    **{'font size multiplier': 1.2}
)

# Load image
img_path = 'manga_page.jpg'
img = cv2.imread(img_path)
if img is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # instead of raising, so fail here with a clear message.
    raise FileNotFoundError(f"Could not read image: {img_path}")
# OpenCV loads images in BGR order; detect() takes RGB (or RGBA) input.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Detect text
mask, text_blocks = detector.detect(img)

print(f"Found {len(text_blocks)} text regions")

for i, blk in enumerate(text_blocks):
    x1, y1, x2, y2 = blk.xyxy
    print(f"Block {i}: ({x1}, {y1}) to ({x2}, {y2})")
    print(f"  Font size: {blk.font_size}")
    print(f"  Detector: {blk.det_model}")

# Visualize mask (blocks until a key is pressed, then closes the window)
cv2.imshow('Text Mask', mask)
cv2.waitKey(0)
cv2.destroyAllWindows()

Creating Custom Detectors

Basic Template

import numpy as np
from typing import Tuple, List
from modules.textdetector.base import (
    TextDetectorBase, 
    register_textdetectors,
    TextBlock,
    DEVICE_SELECTOR
)

@register_textdetectors('my_detector')
class MyDetector(TextDetectorBase):
    """Template detector: keeps predictions scoring above a threshold as text blocks."""

    # Parameter declarations; values are read back with get_param_value().
    params = {
        'device': DEVICE_SELECTOR(),
        'threshold': 0.5,
        'description': 'My custom detector'
    }

    # NOTE(review): presumably names the attributes the base class treats as
    # loadable models; confirm against BaseModule.
    _load_model_keys = {'model'}

    def __init__(self, **params):
        super().__init__(**params)
        # Placeholder until _load_model() assigns the real model object.
        self.model = None

    def setup_detector(self):
        """Cache the configured score threshold on the instance."""
        self.threshold = self.get_param_value('threshold')

    def _load_model(self):
        """Create the detection model and keep it on ``self.model``."""
        # Load your model
        self.model = load_my_model()

    def _detect(self, img: np.ndarray, proj) -> Tuple[np.ndarray, List[TextBlock]]:
        """Run the model on ``img`` and return ``(mask, blk_list)``.

        Every prediction whose score exceeds ``self.threshold`` becomes a
        ``TextBlock`` and has its bounding box painted as 255 into the mask.
        """
        detections = self.model.predict(img)

        # Mask covers the first two dimensions of the image, initially empty.
        mask = np.zeros(img.shape[:2], dtype=np.uint8)
        blk_list = []

        for pred in detections:
            # Skip anything that does not clear the score threshold.
            if not pred.score > self.threshold:
                continue

            blk = TextBlock()
            blk.xyxy = pred.bbox  # [x1, y1, x2, y2]
            blk.font_size = pred.font_size
            blk_list.append(blk)

            # Paint the (integer-truncated) box into the mask.
            left, top, right, bottom = (int(coord) for coord in pred.bbox)
            mask[top:bottom, left:right] = 255

        return mask, blk_list

Advanced: With Font Size Detection

from modules.textdetector.detector_ctd import ComicTextDetector

@register_textdetectors('advanced_detector')
class AdvancedDetector(TextDetectorBase):
    """Template detector that rescales and clamps the detected font sizes."""

    # A max/min of -1 (any non-positive value) disables that bound below.
    params = {
        'device': DEVICE_SELECTOR(),
        'font size multiplier': 1.0,
        'font size max': -1,
        'font size min': -1,
    }

    def _detect(self, img: np.ndarray, proj) -> Tuple[np.ndarray, List[TextBlock]]:
        """Detect text, then post-process each block's font size.

        The detected size is multiplied by 'font size multiplier' and then
        limited by 'font size max' / 'font size min' when those are positive.
        The adjusted value is written to both ``font_size`` and
        ``_detected_font_size``.
        """
        # Detection logic
        mask, blk_list = self.run_detection(img)

        # Apply font size adjustments
        multiplier = self.get_param_value('font size multiplier')
        upper = self.get_param_value('font size max')
        lower = self.get_param_value('font size min')

        for blk in blk_list:
            size = blk._detected_font_size * multiplier
            size = min(upper, size) if upper > 0 else size
            size = max(lower, size) if lower > 0 else size
            blk.font_size = blk._detected_font_size = size

        return mask, blk_list

Registry Usage

Listing Available Detectors

from modules.base import init_textdetector_registries
from modules.textdetector.base import TEXTDETECTORS

# Initialize
init_textdetector_registries()

# List all detectors
print("Available detectors:")
for name, detector_class in TEXTDETECTORS.module_dict.items():
    print(f"  - {name}: {detector_class}")

Dynamic Detector Loading

def get_detector(detector_name: str, **params):
    """Instantiate the detector registered under ``detector_name``.

    Keyword arguments are forwarded unchanged to the detector class.

    Raises:
        ValueError: if ``detector_name`` is not in ``TEXTDETECTORS``.
    """
    if detector_name in TEXTDETECTORS:
        return TEXTDETECTORS[detector_name](**params)

    raise ValueError(f"Unknown detector: {detector_name}")

# Usage
detector = get_detector('ctd', device='cuda', detect_size=1280)

Best Practices

1. Memory Management

# Load model only when needed
detector = TEXTDETECTORS['ctd']()
# Model is loaded automatically on first detect() call

# Unload when done
detector.unload_model(empty_cache=True)

2. Batch Processing

import glob

detector = TEXTDETECTORS['ctd'](device='cuda')

try:
    for img_path in glob.glob('images/*.jpg'):
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None for missing/corrupt files instead of
            # raising; skip the file rather than aborting the whole batch.
            print(f"Skipping unreadable image: {img_path}")
            continue
        # OpenCV loads BGR; detect() takes RGB (or RGBA) input.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        mask, blocks = detector.detect(img)

        # Process blocks
        process_blocks(blocks)
finally:
    # Clean up even when detection or processing raises part-way through
    detector.unload_model(empty_cache=True)

3. Device Management

# Auto-detect best device
from modules.base import DEFAULT_DEVICE

detector = TEXTDETECTORS['ctd'](device=DEFAULT_DEVICE)

# Or specify manually
detector = TEXTDETECTORS['ctd'](device='cuda')

# Change device at runtime
detector.updateParam('device', 'cpu')

4. Parameter Tuning

# Start with defaults
detector = TEXTDETECTORS['ctd']()

# Adjust for better accuracy (larger images)
detector.updateParam('detect_size', 1280)

# Adjust font size detection
detector.set_param_value('font size multiplier', 1.2)
detector.set_param_value('font size min', 12)

# Larger mask coverage
detector.set_param_value('mask dilate size', 3)

Error Handling

# Look the detector class up on its own, so a KeyError here can only mean
# "name not registered" and never an unrelated KeyError raised inside
# construction or detection.
try:
    detector_cls = TEXTDETECTORS['ctd']
except KeyError:
    print("Detector not found. Did you initialize registries?")
else:
    try:
        detector = detector_cls(device='cuda')
        mask, blocks = detector.detect(img)
    except RuntimeError as e:
        print(f"Detection failed: {e}")
        # Fall back to CPU
        detector = detector_cls(device='cpu')
        mask, blocks = detector.detect(img)