//! OCR via libtesseract (dynamic link, gated by the `ocr` cargo feature). //! //! `leptess` wraps `tesseract-sys` (bindgen-generated bindings to libtesseract + //! libleptonica). At build time bindgen reads /usr/include/tesseract/*.h and emits //! the Rust extern declarations; at runtime we dynamic-link against //! `libtesseract.so` / `libleptonica.so` already present on the system. use image::DynamicImage; use leptess::LepTess; use crate::error::{BlastError, Result}; use crate::shhh::encode_png; /// Run OCR on `img` using the given language tag (e.g. "eng", "eng+deu"). /// Returns the recognized text with trailing whitespace trimmed. /// /// leptess only exposes `set_image_from_mem` (decodes via leptonica), so we /// round-trip through PNG. Re-encoding is cheap relative to OCR runtime. pub fn recognize(img: &DynamicImage, language: &str) -> Result { let mut lt = LepTess::new(None, language) .map_err(|e| BlastError::Other(format!("tesseract init: {e}")))?; let png = encode_png(img).map_err(|e| BlastError::Image(e.to_string()))?; lt.set_image_from_mem(&png) .map_err(|e| BlastError::Other(format!("tesseract set_image: {e}")))?; let text = lt .get_utf8_text() .map_err(|e| BlastError::Other(format!("tesseract recognize: {e}")))?; Ok(text.trim().to_string()) }