#!/usr/bin/env python3
import re, requests, datetime
from bs4 import BeautifulSoup
from typing import List, Tuple, Dict, Set
# Request headers sent with every arXiv fetch.
UA = {
    "User-Agent": "arxiv-onecall-html/1.1 (+you@example.com)",
}

# Root of the arXiv site; listing and abstract URLs are built from it.
BASE = "https://arxiv.org"

# Subject codes treated as priority categories.
PRIORITY_CODES: Set[str] = {
    "astro-ph.GA",
    "astro-ph.IM",
}

# Listings fetched individually for the "Interest" page, in output order.
INTEREST_CATS: List[str] = [
    "physics.hist-ph",
    "physics.ed-ph",
    "physics.pop-ph",
]

# Display names used as section headings, keyed by subject code.
HUMAN_NAMES: Dict[str, str] = {
    "astro-ph.GA": "Astrophysics of Galaxies",
    "astro-ph.IM": "Instrumentation and Methods",
    "physics.hist-ph": "History & Philosophy of Physics",
    "physics.ed-ph": "Physics Education",
    "physics.pop-ph": "Popular Physics",
}
def fetch_list_html(category: str) -> str:
    """Download the "new" listing page for one arXiv category.

    Args:
        category: Listing name as used in the URL path, e.g. ``"astro-ph"``.

    Returns:
        The response body as text.

    Raises:
        An HTTP error for 4xx/5xx responses (via ``raise_for_status``), plus
        whatever ``requests`` raises on connection problems or after the
        30-second timeout.
    """
    response = requests.get(
        f"{BASE}/list/{category}/new",
        headers=UA,
        timeout=30,
    )
    response.raise_for_status()
    return response.text
# Identifier part of an "/abs/..." link.  Accepts new-style ids with a four- or
# five-digit sequence number (YYMM.NNNN / YYMM.NNNNN) as well as old-style
# "archive/YYMMNNN" ids (optionally "archive.SC/YYMMNNN"), each with an
# optional version suffix, so that older papers on a listing are not dropped.
_ABS_ID_RE = re.compile(
    r"/abs/((?:\d{4}\.\d{4,5}|[a-z-]+(?:\.[A-Z]{2})?/\d{7})(?:v\d+)?)"
)


def parse_list(html: str) -> List[Dict[str, object]]:
    """Extract the entries of an arXiv listing page.

    The ``<dt>`` and ``<dd>`` children of ``dl#articles`` are paired by
    position. A pair is skipped when its ``<dt>`` has no ``/abs/`` link or
    the link does not contain a recognisable arXiv identifier.

    Args:
        html: Markup of a listing page.

    Returns:
        One dict per entry with the keys ``id`` (str), ``title`` (str),
        ``abstract`` (str), ``abs_url`` (str) and ``codes`` (set of subject
        codes such as ``"astro-ph.GA"``). Title and abstract have been passed
        through ``normalize_tex_like``.
    """
    soup = BeautifulSoup(html, "html.parser")
    dts = soup.select("dl#articles > dt")
    dds = soup.select("dl#articles > dd")

    entries: List[Dict[str, object]] = []
    for dt, dd in zip(dts, dds):
        link = dt.select_one('a[href^="/abs/"]')
        if not link:
            continue
        href = link.get("href", "")
        id_match = _ABS_ID_RE.search(href)
        if not id_match:
            continue

        # Title: drop the first "Title:" label from the extracted text.
        title_el = dd.select_one(".list-title")
        title = ""
        if title_el:
            title = title_el.get_text(" ", strip=True).replace("Title:", "", 1).strip()

        # Abstract: drop a leading "Abstract:" label if present.
        abstract_el = dd.select_one("p.mathjax")
        abstract = abstract_el.get_text(" ", strip=True) if abstract_el else ""
        if abstract.startswith("Abstract:"):
            abstract = abstract[len("Abstract:"):].strip()

        # Subjects: collect the parenthesised codes, e.g. "(astro-ph.GA)".
        subjects_el = dd.select_one(".list-subjects")
        subjects_text = subjects_el.get_text(" ", strip=True) if subjects_el else ""
        codes: Set[str] = set(re.findall(r"\(([A-Za-z0-9.\-]+)\)", subjects_text))

        entries.append({
            "id": id_match.group(1),
            # TeX-like typography is normalised outside math regions only.
            "title": normalize_tex_like(title),
            "abstract": normalize_tex_like(abstract),
            "abs_url": BASE + href,
            "codes": codes,
        })
    return entries
def escape_html(s: str) -> str:
    """Escape text for safe inclusion in HTML content or a quoted attribute.

    Replaces ``&``, ``<``, ``>`` and ``"`` with their character entities.

    Args:
        s: Raw text.

    Returns:
        The escaped text.
    """
    # "&" must be handled first so the ampersands introduced by the other
    # entities are not escaped a second time.
    return (
        s.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
    )
def ordinal_day(d: int) -> str:
    """Return ``d`` followed by its English ordinal suffix (1st, 2nd, 11th, ...)."""
    # Numbers whose last two digits are 10-20 always take "th".
    last_two = d % 100
    if 10 <= last_two <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(d % 10, "th")
    return f"{d}{suffix}"
def nice_date(dt: datetime.date) -> str:
    """Format a date as ordinal day, month name and year."""
    day_part = ordinal_day(dt.day)
    month_and_year = dt.strftime("%B %Y")
    return f"{day_part} {month_and_year}"
def html_head(page_title: str) -> str:
    """Return the document preamble: doctype, ``<head>`` and the opening ``<body>``.

    The head declares UTF-8, sets the escaped page title and loads MathJax so
    that the TeX math delimiters kept in titles and abstracts are rendered.

    Args:
        page_title: Text for the ``<title>`` element; escaped here.

    Returns:
        Markup up to and including the opening ``<body>`` tag.
    """
    # NOTE(review): the delimiter set mirrors the math regions the text
    # normalizer preserves ($...$, $$...$$, \(...\), \[...\]); the CDN build is
    # an assumption, so confirm it is the intended MathJax distribution.
    # Raw, non-f string: the JavaScript braces and backslashes pass through
    # verbatim.
    mj_config = r"""<script>
window.MathJax = {
  tex: {
    inlineMath: [['$', '$'], ['\\(', '\\)']],
    displayMath: [['$$', '$$'], ['\\[', '\\]']],
    processEscapes: true
  }
};
</script>
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>"""
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{escape_html(page_title)}</title>
{mj_config}
</head>
<body>
"""
def html_tail() -> str:
    """Return the markup that closes the ``<body>`` and ``<html>`` elements."""
    return "\n</body>\n</html>\n"
def html_section(heading: str, items: List[Dict[str, object]], start_index: int = 1) -> Tuple[str, int]:
    """Render one headed section of numbered entries.

    Each entry shows its running number, its title linked to the abstract
    page, and the abstract inside a collapsible ``<details>`` element.

    Args:
        heading: Section heading text; escaped here.
        items: Entry dicts providing ``id``, ``title``, ``abstract`` and
            ``abs_url``.
        start_index: Number given to the first entry, so numbering can
            continue across sections.

    Returns:
        ``(html, next_index)`` where ``next_index`` is the number the next
        section should start from.
    """
    parts = [f"<h2>{escape_html(heading)}</h2>\n"]
    n = start_index
    for it in items:
        # Fall back to the arXiv id when the title is empty.
        title = escape_html(str(it["title"] or it["id"]))
        url = escape_html(str(it["abs_url"]))
        abstract = escape_html(str(it["abstract"]))
        parts.append(
            f'<article class="entry" id="entry-{n}">\n'
            f'<h3>{n}. <a href="{url}">{title}</a></h3>\n'
            f"<details><summary>Abstract</summary>\n<p>{abstract}</p>\n</details>\n"
            f"</article>\n"
        )
        n += 1
    return "".join(parts), n
def html_page(page_title: str, date_str: str, top_subtitle: str,
              sections: List[Tuple[str, List[Dict[str, object]]]]) -> str:
    """Assemble a complete HTML page from headed sections of entries.

    Below the page heading a meta line lists the date, the headings of the
    non-empty sections (joined with "; ") and the total entry count. Empty
    sections are omitted from the body; entry numbering runs continuously
    across the sections that are rendered.

    Args:
        page_title: Page heading; also used, with the date, as the document
            title.
        date_str: Human-readable date shown in the title and the meta line.
        top_subtitle: Optional extra line under the meta line; skipped when
            empty.
        sections: ``(heading, entries)`` pairs in display order.

    Returns:
        The full HTML document as a string.
    """
    total = sum(len(items) for _, items in sections)
    head = html_head(f"{page_title} — {date_str}")

    # Only sections that actually have entries are named in the meta line.
    cats_str = "; ".join(heading for heading, items in sections if items)

    meta_bits = [date_str]
    if cats_str:
        meta_bits.append(cats_str)
    meta_bits.append(f'{total} entr{"y" if total == 1 else "ies"}')
    meta_line = escape_html(" · ".join(meta_bits))

    body = [
        f"<h1>{escape_html(page_title)}</h1>",
        f'<p class="meta">{meta_line}</p>',
    ]
    if top_subtitle:
        body.append(f'<p class="subtitle">{escape_html(top_subtitle)}</p>')

    next_index = 1
    for heading, items in sections:
        if not items:
            continue
        sec_html, next_index = html_section(heading, items, start_index=next_index)
        body.append(sec_html)

    return head + "\n".join(body) + html_tail()
# -----------------------------
# LaTeX-like text normalizer
# -----------------------------
# Match math regions and KEEP delimiters so MathJax can render them.
# One capturing group around the whole alternation so that ``split`` returns
# the math regions (delimiters included) in between the plain-text pieces.
# A ``$`` preceded by a backslash is a literal dollar sign, not a delimiter,
# and backslash-escaped characters inside ``$``-delimited math are consumed as
# pairs so that ``\$`` cannot close a region.
_MATH_RE = re.compile(
    r"("
    r"\\\(.+?\\\)"                        # \( ... \)
    r"|\\\[.+?\\\]"                       # \[ ... \]
    r"|(?<!\\)\$\$(?:\\.|[^\\])*?\$\$"    # $$ ... $$
    r"|(?<!\\)\$(?:\\.|[^\\$])*?\$"       # $ ... $
    r")",
    re.DOTALL,
)

# Applied in order to text outside math; "---" must precede "--".
_tex_replacements = [
    (re.compile(r"---"), "—"),           # em dash
    (re.compile(r"--"), "–"),            # en dash
    (re.compile(r"``"), "“"),            # opening double quote
    (re.compile(r"''"), "”"),            # closing double quote
    (re.compile(r"\\,\s*"), "\u2009"),   # thin space
    (re.compile(r"\\;"), "\u2005"),      # four-per-em / medium space
    (re.compile(r"\\:"), "\u2005"),      # medium space
    (re.compile(r"\\!"), ""),            # negative thin space → drop
    (re.compile(r"~"), "\u00A0"),        # non-breaking space
    (re.compile(r"\\-"), ""),            # discretionary hyphen → drop
    (re.compile(r"\\/"), ""),            # italic correction → drop
    (re.compile(r"\\&"), "&"),           # escaped ampersand
]


def _normalize_text_chunk(chunk: str) -> str:
    """Apply every typographic replacement, in table order, to plain text."""
    for pattern, replacement in _tex_replacements:
        chunk = pattern.sub(replacement, chunk)
    return chunk


def _apply_tex_outside_math(s: str) -> str:
    """Normalize the plain-text parts of ``s`` and keep math regions verbatim."""
    # With a single capturing group, split() alternates text (even positions)
    # and math (odd positions).
    return "".join(
        piece if index % 2 else _normalize_text_chunk(piece)
        for index, piece in enumerate(_MATH_RE.split(s))
    )


def normalize_tex_like(s: str) -> str:
    r"""Apply a few LaTeX-style typographic substitutions to plain text.

    Math segments delimited by ``$...$``, ``$$...$$``, ``\(...\)`` or
    ``\[...\]`` are left untouched, delimiters included. An escaped ``\$`` is
    treated as ordinary text and does not open or close a math segment.

    Args:
        s: Text possibly containing TeX markup.

    Returns:
        The normalized text; an empty input is returned unchanged.
    """
    if not s:
        return s
    return _apply_tex_outside_math(s)
# -----------------------------
def main():
    """Fetch today's listings and write the Priority, Main and Interest pages.

    The astro-ph listing is split into Galaxies, Instrumentation and
    everything else; an entry carrying both priority codes is filed under
    Galaxies. Each interest category is fetched separately. All listings are
    fetched before any file is written. Output files are named
    ``<YYYYMMDD>_<Page>.html`` in the current directory.
    """
    today = datetime.date.today()
    stamp = today.strftime("%Y%m%d")
    date_long = nice_date(today)

    # ASTRO: bucket each entry by its subject codes.
    pri_ga: List[Dict[str, object]] = []
    pri_im: List[Dict[str, object]] = []
    main_rest: List[Dict[str, object]] = []
    for entry in parse_list(fetch_list_html("astro-ph")):
        codes = entry["codes"]
        if "astro-ph.GA" in codes:
            bucket = pri_ga
        elif "astro-ph.IM" in codes:
            bucket = pri_im
        else:
            bucket = main_rest
        bucket.append(entry)

    # INTEREST: one section per physics sub-category.
    interest_groups: List[Tuple[str, List[Dict[str, object]]]] = [
        (HUMAN_NAMES.get(cat, cat), parse_list(fetch_list_html(cat)))
        for cat in INTEREST_CATS
    ]

    # Pages in the order they are written; the page title doubles as the
    # file-name suffix.
    pages: List[Tuple[str, List[Tuple[str, List[Dict[str, object]]]]]] = [
        ("Priority", [
            (HUMAN_NAMES["astro-ph.GA"], pri_ga),
            (HUMAN_NAMES["astro-ph.IM"], pri_im),
        ]),
        ("Main", [("Astrophysics (other categories)", main_rest)]),
        ("Interest", interest_groups),
    ]
    for page_title, sections in pages:
        with open(f"{stamp}_{page_title}.html", "w", encoding="utf-8") as f:
            f.write(html_page(page_title, date_long, "", sections))


if __name__ == "__main__":
    main()