Spaces:
Paused
Paused
| """Text preprocessing for radiology reports with complex Unicode and formatting. | |
| Handles reports containing Unicode symbol characters and non-standard | |
| structural formatting that the prompt and the LangExtract library do not | |
| currently support. Normalizing these characters and structures into forms | |
| compatible with downstream processing prevents timeout issues. | |
| Typical usage example: | |
| from sanitize import preprocess_report | |
| clean_text = preprocess_report(raw_report) | |
| """ | |
| from __future__ import annotations | |
| import re | |
| import ftfy | |
# Translation table mapping individual Unicode symbols to ASCII replacements.
# Intended for use with ``str.translate``; replacements may be longer than one
# character (e.g. an arrow becomes "->").
_TRANSLATE = str.maketrans(
    {
        # Bullets
        "\u2022": "*",  # BULLET
        "\u25cf": "*",  # BLACK CIRCLE
        # Arrows
        "\u27a1": "->",  # BLACK RIGHTWARDS ARROW
        "\uf0e0": "->",  # private-use code point; presumably a symbol-font arrow (TODO confirm)
        "\u2192": "->",  # RIGHTWARDS ARROW
        "\u2190": "<-",  # LEFTWARDS ARROW
        "\u2191": "up",  # UPWARDS ARROW
        # Miscellaneous symbols
        "\u00d7": "x",  # MULTIPLICATION SIGN
        "\u2642": "male",  # MALE SIGN
        "\u2640": "female",  # FEMALE SIGN
        # Hyphens and dashes
        "\u2010": "-",  # HYPHEN
        "\u2013": "-",  # EN DASH
        "\u2014": "-",  # EM DASH
        # Spaces
        "\u00a0": " ",  # NO-BREAK SPACE
    }
)
# Runs of spaces/tabs; newlines are deliberately not part of this class.
_WS = re.compile(r"[ \t]+")
# Three or more newlines, possibly interleaved with other whitespace.
_BLANKS = re.compile(r"\n\s*\n\s*\n+")

# Structure normalization patterns

# "--- BEGIN <title> ---" / "--- END <title> ---" wrapper markers.
# The title is matched lazily up to the closing "---" on the same line, so
# hyphenated titles (e.g. "X-RAY REPORT") are recognized, and a marker that is
# missing its closing rule cannot match across lines and swallow report text.
_BEGIN = re.compile(r"---\s*BEGIN [^\n]+?---\n*", re.I)
_END = re.compile(r"\n*---\s*END [^\n]+?---\s*", re.I)
# "*** Title ***" headers; group 1 captures the title text.
_HEADER = re.compile(r"\*{3}\s*([^*]+?)\s*\*{3}", re.I)
# One or more bullet markers (*, U+2022, U+25CF, -) at the start of a line,
# together with the whitespace that follows them.
_BULLET_HDR = re.compile(r"^[ \t]*[\*\u2022\u25CF-]+\s*", re.M)
# "N)" or "N." enumerators at the start of a line; group 1 captures the number.
_ENUM = re.compile(r"^[ \t]*(\d+)[\)\.][ \t]+", re.M)
def sanitize_text(text: str) -> str:
    """Repair Unicode problems and tidy whitespace in a report.

    The text is passed through ``ftfy.fix_text`` (with control-character
    removal and NFC normalization requested), then symbols listed in
    ``_TRANSLATE`` are replaced by their ASCII equivalents. Finally, runs of
    spaces/tabs are collapsed, line endings are converted to ``\\n``, and
    runs of blank lines are reduced to a single blank line.

    Args:
        text: The input text to sanitize.

    Returns:
        The sanitized text with leading and trailing whitespace stripped.
    """
    repaired = ftfy.fix_text(text, remove_control_chars=True, normalization="NFC")
    ascii_friendly = repaired.translate(_TRANSLATE)

    # Collapse horizontal whitespace first; newlines are handled separately.
    single_spaced = _WS.sub(" ", ascii_friendly)

    # Convert CRLF before bare CR so a CRLF pair yields one newline, not two.
    unix_newlines = single_spaced.replace("\r\n", "\n").replace("\r", "\n")

    return _BLANKS.sub("\n\n", unix_newlines).strip()
def normalize_structure(text: str) -> str:
    """Rewrite report structure markers into a plain, uniform layout.

    Steps, applied in this order:
      1. Delete every ``_BEGIN`` and ``_END`` match (report wrapper markers).
      2. Replace each ``_HEADER`` match with its captured title, stripped of
         surrounding whitespace and followed by a colon.
      3. Delete every ``_BULLET_HDR`` match (leading bullet markers).
      4. Rewrite each ``_ENUM`` match as ``"<number>. "``.

    Args:
        text: The input text to normalize.

    Returns:
        The normalized text with leading and trailing whitespace stripped.
    """

    def _as_colon_header(match: re.Match[str]) -> str:
        # The captured title can still carry whitespace, hence the strip.
        return f"{match.group(1).strip()}:"

    without_wrappers = _END.sub("", _BEGIN.sub("", text))
    with_headers = _HEADER.sub(_as_colon_header, without_wrappers)
    without_bullets = _BULLET_HDR.sub("", with_headers)
    numbered = _ENUM.sub(r"\g<1>. ", without_bullets)
    return numbered.strip()
def preprocess_report(raw: str) -> str:
    """Run the full preprocessing pipeline on a raw radiology report.

    This is the main entry point of the module: the report is first passed
    through ``sanitize_text`` and the result is then passed through
    ``normalize_structure``.

    Args:
        raw: The raw radiology report text.

    Returns:
        The preprocessed text, intended as input for structured extraction.
    """
    sanitized = sanitize_text(raw)
    return normalize_structure(sanitized)