Skip to content

ScanOcr

Extracts optical text from image files and creates a thumbnail.

This scanner extracts text from image files using OCR (Optical Character Recognition) and generates a base64-encoded thumbnail. It supports direct image files and converting PDFs to images for OCR.

Options

extract_text: If True, extracted text is emitted as a child file. (default: False) split_words: If True, splits the OCR text into words and stores an array. (default: True) remove_formatting: If True, removes formatting characters (e.g., \r). Overridden by split_words. (default: True) tmp_directory: Directory for temporary files. (default: '/tmp/') pdf_to_png: If True, converts PDFs to PNG for OCR. (default: False) create_thumbnail: If True, creates a thumbnail for the image. (default: False) thumbnail_size: Size of the thumbnail to create. (default: (250, 250))

Source code in strelka/src/python/strelka/scanners/scan_ocr.py
class ScanOcr(strelka.Scanner):
    r"""Extracts optical text from image files and creates a thumbnail.

    This scanner extracts text from image files using OCR (Optical Character Recognition) and
    generates a base64-encoded thumbnail. It supports direct image files and converting PDFs
    to images for OCR.

    Options:
        extract_text: If True, extracted text is emitted as a child file. (default: False)
        split_words: Not read by this implementation; the whitespace-split word list is
            always stored under ``text`` when OCR produced output.
        remove_formatting: If True, removes \r, \n and \f from the text stored under
            ``string_text``. (default: True)
        tmp_directory: Directory for temporary files. (default: '/tmp/')
        pdf_to_png: If True, converts PDFs to PNG for OCR. (default: False)
        create_thumbnail: If True, creates a thumbnail for the image. (default: False)
        thumbnail_size: Size of the thumbnail to create. (default: (250, 250))
    """

    def scan(self, data, file, options, expire_at):
        """Run OCR over ``data`` and record the results on ``self.event``.

        Args:
            data: Raw bytes of the file being scanned.
            file: File object; only ``file.flavors`` is consulted here.
            options: Scanner options (see the class docstring).
            expire_at: Not used by this method.

        Raises:
            strelka.ScannerException: If the ``tesseract`` process exits with a
                non-zero status.
        """
        extract_text = options.get("extract_text", False)
        remove_formatting = options.get("remove_formatting", True)
        tmp_directory = options.get("tmp_directory", "/tmp/")
        pdf_to_png = options.get("pdf_to_png", False)
        create_thumbnail = options.get("create_thumbnail", False)
        thumbnail_size = options.get("thumbnail_size", (250, 250))

        # Convert PDF to PNG if required. Only the first page is rendered.
        if pdf_to_png and "application/pdf" in file.flavors.get("mime", []):
            try:
                # The context manager closes the document on every path,
                # including the early return for encrypted PDFs.
                with fitz.open(stream=data, filetype="pdf") as reader:
                    if reader.is_encrypted:
                        return
                    data = reader.get_page_pixmap(0).tobytes("png")
            except Exception as e:
                # Best effort: on failure the original bytes are passed on.
                self.flags.append(
                    f"{self.__class__.__name__}: image_pdf_error: {str(e)[:50]}"
                )

        # Create a thumbnail from the image.
        # Stores as a base64 value in the key: base64_thumbnail
        if create_thumbnail:
            try:
                image = Image.open(io.BytesIO(data))
                image.thumbnail(thumbnail_size, Image.Resampling.BILINEAR)
                buffered = io.BytesIO()
                image.save(buffered, format="WEBP", quality=70, optimize=True)
                base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
                self.event["base64_thumbnail"] = base64_image
            except Exception as e:
                self.flags.append(
                    f"{self.__class__.__name__}: image_thumbnail_error: {str(e)[:50]}"
                )

        # Perform OCR on the image data.
        with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_data:
            tmp_data.write(data)
            tmp_data.flush()

            # tmp_tess only reserves a unique base name; tesseract writes its
            # result to "<base name>.txt", which is not managed by tempfile.
            with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_tess:
                tess_txt_name = f"{tmp_tess.name}.txt"
                try:
                    subprocess.run(
                        ["tesseract", tmp_data.name, tmp_tess.name],
                        capture_output=True,
                        check=True,
                    )

                    with open(tess_txt_name, "rb") as tess_txt:
                        ocr_file = tess_txt.read()

                except subprocess.CalledProcessError as e:
                    self.flags.append(
                        f"{self.__class__.__name__}: tesseract_process_error: {str(e)[:50]}"
                    )
                    raise strelka.ScannerException(e.stderr) from e

                finally:
                    # Always clean up tesseract's output file, even when the
                    # process failed or reading the result raised.
                    try:
                        os.remove(tess_txt_name)
                    except FileNotFoundError:
                        # tesseract never produced the file; nothing to remove.
                        pass

        if ocr_file:
            self.event["text"] = ocr_file.split()
            if remove_formatting:
                # Delete CR, LF and FF bytes in a single pass.
                self.event["string_text"] = ocr_file.translate(None, b"\r\n\f")
            else:
                self.event["string_text"] = ocr_file
        if extract_text:
            # Send extracted file back to Strelka
            self.emit_file(ocr_file, name="text")

Features

The features of this scanner are detailed below. These features represent the capabilities and the type of analysis the scanner can perform. This may include support for Indicators of Compromise (IOC), the ability to emit files for further analysis, and the presence of extended documentation for complex analysis techniques.

Feature
Support
IOC Support
Emit Files
Extended Docs
Malware Scanner
Image Thumbnails

Tastes

Strelka's file distribution system assigns scanners to files based on 'flavors' and 'tastes'. Flavors describe the type of file, typically determined by MIME types from libmagic, matches from YARA rules, or characteristics of parent files. Tastes are the criteria used within Strelka to determine which scanners are applied to which files, with positive and negative tastes defining files to be included or excluded respectively.

Source Filetype
Include / Exclude
application/pdf
bmp_file
gif_file
image/gif
image/jpeg
image/png
image/tiff
image/webp
image/x-ms-bmp
jpeg_file
pdf_file
png_file
type_is_tiff

Scanner Fields

This section provides a list of fields that are extracted from the files processed by this scanner. These fields include the data elements that the scanner extracts from each file, representing the analytical results produced by the scanner. If the test file is missing or cannot be parsed, this section will not contain any data.

Field Name
Field Type
base64_thumbnail
str
elapsed
str
flags
list
string_text
bytes
text
list

Sample Event

Below is a sample event generated by this scanner, demonstrating the kind of output that can be expected when it processes a file. This sample is derived from a mock scan event configured in the scanner's test file. If no test file is available, this section will not display a sample event.

    # Mock scan event used as the expected output for the sample file.
    # "string_text" is one bytes value with no line breaks; note that some
    # neighbouring words appear fused in it (e.g. b"faucibusconsectetur"),
    # presumably where a line break was removed without inserting a space.
    # "text" lists the same content as individual whitespace-separated tokens.
    test_scan_event = {
        "elapsed": 0.001,
        "flags": [],
        "string_text": b"Lorem Ipsum Lorem ipsum dolor sit amet, consectetur adipisci"
        b"ng elit. Cras lobortis sem dui. Morbi at magna quis ligula f"
        b"aucibusconsectetur feugiat at purus. Sed nec lorem nibh. Nam"
        b" vel libero odio. Vivamus tempus non enim egestas pretium.Ve"
        b"stibulum turpis arcu, maximus nec libero quis, imperdiet sus"
        b"cipit purus. Vestibulum blandit quis lacus nonsollicitudin. "
        b"Nullam non convallis dui, et aliquet risus. Sed accumsan ull"
        b"amcorper vehicula. Proin non urna facilisis,condimentum eros"
        b" quis, suscipit purus. Morbi euismod imperdiet neque ferment"
        b"um dictum. Integer aliquam, erat sitamet fringilla tempus, m"
        b"auris ligula blandit sapien, et varius sem mauris eu diam. S"
        b"ed fringilla neque est, in laoreetfelis tristique in. Donec "
        b"luctus velit a posuere posuere. Suspendisse sodales pellente"
        b"sque quam.",
        "text": [
            b"Lorem",
            b"Ipsum",
            b"Lorem",
            b"ipsum",
            b"dolor",
            b"sit",
            b"amet,",
            b"consectetur",
            b"adipiscing",
            b"elit.",
            b"Cras",
            b"lobortis",
            b"sem",
            b"dui.",
            b"Morbi",
            b"at",
            b"magna",
            b"quis",
            b"ligula",
            b"faucibus",
            b"consectetur",
            b"feugiat",
            b"at",
            b"purus.",
            b"Sed",
            b"nec",
            b"lorem",
            b"nibh.",
            b"Nam",
            b"vel",
            b"libero",
            b"odio.",
            b"Vivamus",
            b"tempus",
            b"non",
            b"enim",
            b"egestas",
            b"pretium.",
            b"Vestibulum",
            b"turpis",
            b"arcu,",
            b"maximus",
            b"nec",
            b"libero",
            b"quis,",
            b"imperdiet",
            b"suscipit",
            b"purus.",
            b"Vestibulum",
            b"blandit",
            b"quis",
            b"lacus",
            b"non",
            b"sollicitudin.",
            b"Nullam",
            b"non",
            b"convallis",
            b"dui,",
            b"et",
            b"aliquet",
            b"risus.",
            b"Sed",
            b"accumsan",
            b"ullamcorper",
            b"vehicula.",
            b"Proin",
            b"non",
            b"urna",
            b"facilisis,",
            b"condimentum",
            b"eros",
            b"quis,",
            b"suscipit",
            b"purus.",
            b"Morbi",
            b"euismod",
            b"imperdiet",
            b"neque",
            b"fermentum",
            b"dictum.",
            b"Integer",
            b"aliquam,",
            b"erat",
            b"sit",
            b"amet",
            b"fringilla",
            b"tempus,",
            b"mauris",
            b"ligula",
            b"blandit",
            b"sapien,",
            b"et",
            b"varius",
            b"sem",
            b"mauris",
            b"eu",
            b"diam.",
            b"Sed",
            b"fringilla",
            b"neque",
            b"est,",
            b"in",
            b"laoreet",
            b"felis",
            b"tristique",
            b"in.",
            b"Donec",
            b"luctus",
            b"velit",
            b"a",
            b"posuere",
            b"posuere.",
            b"Suspendisse",
            b"sodales",
            b"pellentesque",
            b"quam.",
        ],
    }