File size: 1,767 Bytes
"""
Booktitle normalizer: maps verbose venue names to standard abbreviations.
Loads rules from data/abbr.tsv (regex → abbreviation).
"""
import csv
import logging
import re
from pathlib import Path
from typing import Optional
class BooktitleNormalizer:
    """Normalize booktitle/journal names to standard abbreviations.

    Rules are ``(compiled regex, abbreviation)`` pairs loaded from a
    tab-separated file. They are tried in file order and the first pattern
    that matches anywhere in the input (case-insensitively) wins.
    """

    def __init__(self, tsv_path: Optional[str] = None):
        """Create a normalizer and load its rules.

        Args:
            tsv_path: Path to the TSV rule file. When ``None``, the file
                ``data/abbr.tsv`` located next to this module's parent
                directory is used.
        """
        if tsv_path is None:
            tsv_path = str(Path(__file__).resolve().parent.parent / "data" / "abbr.tsv")
        self.rules: list[tuple[re.Pattern, str]] = []
        self._load_rules(tsv_path)

    def _load_rules(self, tsv_path: str) -> None:
        """Load regex → abbreviation rules from a TSV file into ``self.rules``.

        Each usable row has the regex in the first column and the
        abbreviation in the second; both are stripped of surrounding
        whitespace. Rows with fewer than two columns, an empty pattern, or a
        pattern starting with ``#`` are ignored. Rows whose pattern is not a
        valid regular expression are skipped with a logged warning.

        Loading is best-effort: if ``tsv_path`` is not an existing regular
        file, no rules are loaded and no error is raised.

        Args:
            tsv_path: Path to the TSV rule file.
        """
        path = Path(tsv_path)
        if not path.is_file():
            return

        # 'utf-8-sig' drops a leading BOM (otherwise it would become part of
        # the first pattern and silently break that rule); it reads BOM-less
        # UTF-8 unchanged. newline='' lets the csv module do its own newline
        # handling, as its documentation requires.
        with open(path, 'r', encoding='utf-8-sig', newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            for row in reader:
                if len(row) < 2:
                    continue
                pattern_str = row[0].strip()
                abbr = row[1].strip()
                # Skip comments and empty lines
                if not pattern_str or pattern_str.startswith('#'):
                    continue
                try:
                    compiled = re.compile(pattern_str, re.IGNORECASE)
                except re.error as exc:
                    # Keep going so one bad rule does not disable the rest,
                    # but leave a trace of what was dropped.
                    logging.getLogger(__name__).warning(
                        "Skipping invalid regex %r at %s:%d: %s",
                        pattern_str, path, reader.line_num, exc,
                    )
                    continue
                self.rules.append((compiled, abbr))

    def normalize(self, booktitle: str) -> Optional[str]:
        """Normalize a booktitle to its standard abbreviation.

        Args:
            booktitle: The verbose venue name to look up.

        Returns:
            The abbreviation of the first rule whose pattern is found in
            ``booktitle``, or ``None`` if ``booktitle`` is empty/``None`` or
            no rule matches.
        """
        if not booktitle:
            return None
        for pattern, abbr in self.rules:
            if pattern.search(booktitle):
                return abbr
        return None
|