From Pixels to Privacy: Using OCR and Microsoft Presidio to Redact Sensitive Data in ID Cards
Appendix: Code Snippets
Text Analysis with Presidio
import re
import cv2
import pytesseract
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from PIL import Image

# Load the ID-card scan. cv2.imread returns None (it does not raise) when the
# file is missing or unreadable, so fail early with a clear error.
image = cv2.imread("ktp_sample.jpg")
if image is None:
    raise FileNotFoundError("ktp_sample.jpg")

# Run OCR over the whole image and get the recognised text as one string.
text = pytesseract.image_to_string(image)

# Register a custom recognizer that flags any standalone run of exactly
# 16 digits as an INDONESIAN_NIK entity. Keyword arguments are used because
# the second positional parameter of PatternRecognizer is `name`, not
# `patterns`.
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(
    PatternRecognizer(
        supported_entity="INDONESIAN_NIK",
        patterns=[Pattern("NIK", r"\b\d{16}\b", 0.9)],
    )
)

# Detect only the custom entity, then replace each hit in the text.
results = analyzer.analyze(text=text, entities=["INDONESIAN_NIK"], language="en")
anonymized = AnonymizerEngine().anonymize(text=text, analyzer_results=results)
print(anonymized.text)
OCR Extraction with Tesseract
import cv2, pytesseract
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer
from pytesseract import Output
# Point pytesseract at the Tesseract binary (default Windows install path).
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Load the ID-card scan. cv2.imread returns None (it does not raise) when the
# file is missing or unreadable, so fail early with a clear error.
image = cv2.imread("ktp_sample.jpeg")
if image is None:
    raise FileNotFoundError("ktp_sample.jpeg")

# image_to_data returns parallel lists ("text", "left", "top", "width",
# "height", ...) with one entry per detected element.
ocr_data = pytesseract.image_to_data(image, output_type=Output.DICT)
text = " ".join(ocr_data["text"])

# Register a custom recognizer that flags any standalone run of exactly
# 16 digits as a NIK_ID entity. Keyword arguments are used because the second
# positional parameter of PatternRecognizer is `name`, not `patterns`.
analyzer = AnalyzerEngine()
nik_pattern = Pattern(name="NIK Pattern", regex=r"\b\d{16}\b", score=0.9)
analyzer.registry.add_recognizer(
    PatternRecognizer(supported_entity="NIK_ID", patterns=[nik_pattern])
)
results = analyzer.analyze(text=text, entities=["NIK_ID"], language="en")

# Collect the matched substrings once, outside the loop.
sensitive_values = {text[r.start:r.end] for r in results}

# Paint a filled black rectangle over every OCR word containing a match.
for i, word in enumerate(ocr_data["text"]):
    if any(value in word for value in sensitive_values):
        left, top = ocr_data["left"][i], ocr_data["top"][i]
        right = left + ocr_data["width"][i]
        bottom = top + ocr_data["height"][i]
        cv2.rectangle(image, (left, top), (right, bottom), (0, 0, 0), -1)
cv2.imwrite("ktp_redacted.jpeg", image)
OCR + Presidio Pipeline
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

# OCR result from Tesseract
extracted_text = "NIK 3171234567890123, Name: Mira Setiawan"

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Analyze with the engine's default recognizers (no custom NIK recognizer is
# registered in this snippet), then anonymize every detected span.
results = analyzer.analyze(text=extracted_text, language="en")
redacted = anonymizer.anonymize(text=extracted_text, analyzer_results=results)

print("=== Redacted ===")
print(redacted.text)
Visual Redaction with Bounding Boxes
import cv2
import pytesseract
# Load the ID-card scan. cv2.imread returns None (it does not raise) when the
# file is missing or unreadable, so fail early with a clear error.
image = cv2.imread("ktp_sample.jpg")
if image is None:
    raise FileNotFoundError("ktp_sample.jpg")

# image_to_data returns parallel lists ("text", "left", "top", "width",
# "height", ...) with one entry per detected element.
data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

# Black out every OCR word that is exactly 16 digits long.
for i, word in enumerate(data["text"]):
    if word.isdigit() and len(word) == 16:
        x, y, w, h = data["left"][i], data["top"][i], data["width"][i], data["height"][i]
        # Thickness -1 draws a filled rectangle.
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 0), -1)

cv2.imwrite("ktp_redacted.jpg", image)
Recent Comments