Spaces:

google
/

radextract

Paused

App Files Files Community

radextract / sanitize.py

goelak

Initial commit for RadExtract

fab8051 5 months ago

raw

history blame

3.09 kB

	"""Text preprocessing for radiology reports with complex Unicode and formatting.

	Handles reports containing complex Unicode symbolic characters and non-standard
	structural formatting that are not currently supported by the prompt and LangExtract
	library. Prevents timeout issues by normalizing problematic characters and structures
	to formats compatible with downstream processing.

	Typical usage example:

	from sanitize import preprocess_report

	clean_text = preprocess_report(raw_report)
	"""

	from __future__ import annotations

	import re

	import ftfy

	_TRANSLATE = str.maketrans(
	{
	0x2022: "*",
	0x25CF: "*",
	0x27A1: "->",
	0xF0E0: "->",
	0x2192: "->",
	0x2190: "<-",
	0x00D7: "x",
	0x2191: "up",
	0x2642: "male",
	0x2640: "female",
	0x2010: "-",
	0x2013: "-",
	0x2014: "-",
	0x00A0: " ",
	}
	)

	_WS = re.compile(r"[ \t]+")
	_BLANKS = re.compile(r"\n\s\n\s\n+")

	# Structure normalization patterns
	_BEGIN = re.compile(r"---\sBEGIN [^-]+---\n", re.I)
	_END = re.compile(r"\n---\sEND [^-]+---\s*", re.I)
	_HEADER = re.compile(r"\{3}\s([^]+?)\s\*{3}", re.I)
	_BULLET_HDR = re.compile(r"^[ \t][\\u2022\u25CF-]+\s*", re.M)
	_ENUM = re.compile(r"^[ \t]*(\d+)[\)\.][ \t]+", re.M)


	def sanitize_text(text: str) -> str:
	    """Sanitizes Unicode characters and normalizes whitespace.

	    Steps, in order: ftfy text repair (control characters removed, NFC
	    normalization), replacement of the symbols listed in ``_TRANSLATE``,
	    collapsing of space/tab runs to a single space, conversion of CRLF/CR
	    line endings to LF, and collapsing of excess blank lines matched by
	    ``_BLANKS`` to a single blank line.

	    Args:
	        text: The input text to sanitize.

	    Returns:
	        The sanitized text with leading and trailing whitespace stripped.
	    """
	    cleaned = ftfy.fix_text(text, remove_control_chars=True, normalization="NFC")
	    cleaned = cleaned.translate(_TRANSLATE)
	    # Only horizontal whitespace is collapsed here; line breaks are kept.
	    cleaned = _WS.sub(" ", cleaned)
	    # Unify line endings before looking for runs of blank lines.
	    cleaned = cleaned.replace("\r\n", "\n").replace("\r", "\n")
	    cleaned = _BLANKS.sub("\n\n", cleaned)
	    return cleaned.strip()


	def normalize_structure(text: str) -> str:
	    """Normalizes structural elements in radiology reports.

	    Removes text matched by the ``_BEGIN`` / ``_END`` report-wrapper
	    patterns, rewrites headers matched by ``_HEADER`` as ``"<title>:"``,
	    drops line-leading bullet markers matched by ``_BULLET_HDR``, and
	    rewrites line-leading enumerators matched by ``_ENUM`` as ``"<n>. "``.

	    Args:
	        text: The input text to normalize.

	    Returns:
	        The normalized text with leading and trailing whitespace stripped.
	    """
	    # Strip the report wrapper lines first so later patterns never see them.
	    text = _BEGIN.sub("", text)
	    text = _END.sub("", text)
	    # Keep only the captured header title and terminate it with a colon.
	    text = _HEADER.sub(lambda m: f"{m.group(1).strip()}:", text)
	    text = _BULLET_HDR.sub("", text)
	    # Re-emit the captured number in a single "N. " form.
	    text = _ENUM.sub(lambda m: f"{m.group(1)}. ", text)
	    return text.strip()


	def preprocess_report(raw: str) -> str:
	    """Preprocesses radiology reports with sanitization and normalization.

	    Runs ``sanitize_text`` on the raw report and then passes the result
	    through ``normalize_structure``. This is the main entry point for text
	    preprocessing.

	    Args:
	        raw: The raw radiology report text.

	    Returns:
	        Preprocessed text ready for structured extraction.
	    """
	    sanitized = sanitize_text(raw)
	    return normalize_structure(sanitized)