echo ""; echo "";

From Pixels to Privacy: Using OCR and Microsoft Presidio to Redact Sensitive Data in ID Cards


Appendix: Code Snippets

Text Analysis with Presidio

import re
import cv2
import pytesseract
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from PIL import Image
# Run OCR on the ID card image, detect 16-digit NIK numbers with a custom
# Presidio recognizer, and print the text with those numbers anonymized.
image = cv2.imread("ktp_sample.jpg")
if image is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError("Could not read image: ktp_sample.jpg")
text = pytesseract.image_to_string(image)

analyzer = AnalyzerEngine()
# A NIK is exactly 16 consecutive digits delimited by word boundaries.
nik_pattern = Pattern("NIK", r"\b\d{16}\b", 0.9)
# `patterns` must be passed by keyword: the second positional parameter of
# PatternRecognizer is `name`, not the pattern list.
analyzer.registry.add_recognizer(
    PatternRecognizer("INDONESIAN_NIK", patterns=[nik_pattern])
)

results = analyzer.analyze(text=text, entities=["INDONESIAN_NIK"], language="en")
anonymized = AnonymizerEngine().anonymize(text=text, analyzer_results=results)
print(anonymized.text)

OCR Extraction with Tesseract

import cv2, pytesseract
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer
from pytesseract import Output

# Point pytesseract at the Tesseract executable (Windows install path).
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

image = cv2.imread("ktp_sample.jpeg")
if image is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError("Could not read image: ktp_sample.jpeg")

# Word-level OCR output: parallel lists of text and bounding-box coordinates.
ocr_data = pytesseract.image_to_data(image, output_type=Output.DICT)
text = " ".join(ocr_data["text"])

analyzer = AnalyzerEngine()
# A NIK is exactly 16 consecutive digits delimited by word boundaries.
nik_pattern = Pattern(name="NIK Pattern", regex=r"\b\d{16}\b", score=0.9)
# `patterns` must be passed by keyword: the second positional parameter of
# PatternRecognizer is `name`, not the pattern list.
analyzer.registry.add_recognizer(PatternRecognizer("NIK_ID", patterns=[nik_pattern]))

results = analyzer.analyze(text=text, entities=["NIK_ID"], language="en")

# Slice the detected values out of the analyzed text once, outside the loop.
sensitive_values = [text[r.start:r.end] for r in results]

# Black out every OCR word box that contains a detected value.
for i, word in enumerate(ocr_data["text"]):
    if any(value in word for value in sensitive_values):
        x, y = ocr_data["left"][i], ocr_data["top"][i]
        w, h = ocr_data["width"][i], ocr_data["height"][i]
        # Thickness -1 draws a filled rectangle.
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 0), -1)

cv2.imwrite("ktp_redacted.jpeg", image)

OCR + Presidio Pipeline

from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
# OCR result from Tesseract
extracted_text = "NIK 3171234567890123, Name: Mira Setiawan"

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Detect entities with the analyzer's registered recognizers, then replace
# each detected span in the text.
results = analyzer.analyze(text=extracted_text, language="en")
redacted = anonymizer.anonymize(text=extracted_text, analyzer_results=results)

print("=== Redacted ===")
print(redacted.text)

Visual Redaction with Bounding Boxes

import cv2
import pytesseract
# Black out every OCR word that is exactly 16 digits (the NIK format) and
# save the redacted image.
image = cv2.imread("ktp_sample.jpg")
if image is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError("Could not read image: ktp_sample.jpg")

# Word-level OCR output: parallel lists of text and bounding-box coordinates.
data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

for i, word in enumerate(data["text"]):
    if word.isdigit() and len(word) == 16:
        x, y, w, h = data["left"][i], data["top"][i], data["width"][i], data["height"][i]
        # Thickness -1 draws a filled rectangle.
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 0), -1)

cv2.imwrite("ktp_redacted.jpg", image)

Leave a Reply

Your email address will not be published. Required fields are marked *

WordPress Appliance - Powered by TurnKey Linux