From Pixels to Privacy: Using OCR and Microsoft Presidio to Redact Sensitive Data in ID Cards

Table of Contents

Appendix: Code Snippets
Text Analysis with Presidio
OCR Extraction with Tesseract
OCR + Presidio Pipeline
Visual Redaction with Bounding Boxes

Appendix: Code Snippets

Text Analysis with Presidio

import re
import cv2
import pytesseract
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from PIL import Image

# Purpose: OCR an ID-card image to plain text, detect 16-digit NIK numbers
# with a custom Presidio recognizer, and print the anonymized text.

# cv2.imread returns None (it does not raise) when the file cannot be read,
# so fail early with a clear message instead of a confusing OCR error.
image = cv2.imread("ktp_sample.jpg")
if image is None:
    raise FileNotFoundError("Could not read image: ktp_sample.jpg")

text = pytesseract.image_to_string(image)

# The second positional parameter of PatternRecognizer is `name`, so the
# pattern list must be passed by keyword. The regex needs its backslashes:
# word boundary, exactly sixteen digits, word boundary.
nik_recognizer = PatternRecognizer(
    supported_entity="INDONESIAN_NIK",
    patterns=[Pattern("NIK", r"\b\d{16}\b", 0.9)],
)

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(nik_recognizer)

# Restrict analysis to the custom entity so only NIK matches are returned.
results = analyzer.analyze(text=text, entities=["INDONESIAN_NIK"], language="en")
anonymized = AnonymizerEngine().anonymize(text=text, analyzer_results=results)
print(anonymized.text)

OCR Extraction with Tesseract

import cv2
import pytesseract
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer
from pytesseract import Output

# Purpose: OCR an ID-card image word by word, detect 16-digit NIK numbers
# with Presidio, black out the bounding box of every OCR word that contains
# a detected NIK, and save the redacted image.

# Windows install location of the Tesseract binary (backslashes restored).
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# cv2.imread returns None (it does not raise) when the file cannot be read.
image = cv2.imread("ktp_sample.jpeg")
if image is None:
    raise FileNotFoundError("Could not read image: ktp_sample.jpeg")

# Output.DICT yields parallel lists ("text", "left", "top", "width",
# "height", ...) indexed per detected OCR element.
ocr_data = pytesseract.image_to_data(image, output_type=Output.DICT)
text = " ".join(ocr_data["text"])

# Pass the patterns by keyword: the second positional parameter of
# PatternRecognizer is `name`, not `patterns`.
nik_pattern = Pattern(name="NIK Pattern", regex=r"\b\d{16}\b", score=0.9)
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(
    PatternRecognizer(supported_entity="NIK_ID", patterns=[nik_pattern])
)
results = analyzer.analyze(text=text, entities=["NIK_ID"], language="en")

# Slice the detected substrings out of the analyzed text once, instead of
# re-slicing them for every OCR word.
detected_values = {text[r.start:r.end] for r in results}

for i, word in enumerate(ocr_data["text"]):
    if any(value in word for value in detected_values):
        left, top = ocr_data["left"][i], ocr_data["top"][i]
        right = left + ocr_data["width"][i]
        bottom = top + ocr_data["height"][i]
        # Thickness -1 draws a filled rectangle; (0, 0, 0) is black.
        cv2.rectangle(image, (left, top), (right, bottom), (0, 0, 0), -1)

cv2.imwrite("ktp_redacted.jpeg", image)

OCR + Presidio Pipeline

from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

# Purpose: run Presidio's analyzer and anonymizer over a hard-coded sample of
# OCR output and print the redacted text. No custom recognizer is registered
# here, so only the recognizers the AnalyzerEngine loads by default apply.

# OCR result from Tesseract
extracted_text = "NIK 3171234567890123, Name: Mira Setiawan"

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# No `entities` filter is given, so every supported entity type is searched.
results = analyzer.analyze(text=extracted_text, language="en")
redacted = anonymizer.anonymize(text=extracted_text, analyzer_results=results)

print("=== Redacted ===")
print(redacted.text)

Visual Redaction with Bounding Boxes

import cv2
import pytesseract

# Purpose: black out every OCR word that is exactly sixteen digits long
# (the NIK format) directly on the image, then save the redacted copy.

# cv2.imread returns None (it does not raise) when the file cannot be read.
image = cv2.imread("ktp_sample.jpg")
if image is None:
    raise FileNotFoundError("Could not read image: ktp_sample.jpg")

# Output.DICT yields parallel lists indexed per detected OCR element.
data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

for i, word in enumerate(data["text"]):
    if word.isdigit() and len(word) == 16:
        x, y = data["left"][i], data["top"][i]
        w, h = data["width"][i], data["height"][i]
        # Thickness -1 draws a filled rectangle; (0, 0, 0) is black.
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 0), -1)

cv2.imwrite("ktp_redacted.jpg", image)

Articles

From Pixels to Privacy: Using OCR and Microsoft Presidio to Redact Sensitive Data in ID Cards

By RabinsXP Team

December 26, 2025

Appendix: Code Snippets

Text Analysis with Presidio

OCR Extraction with Tesseract

OCR + Presidio Pipeline

Visual Redaction with Bounding Boxes