# Written by ChatGPT-5. Creates three HTML pages for easier viewing of arXiv, arranged by my personal preferences :
# Priority : Astrophysics of Galaxies, Astronomical Instrumentation and Methods
# Interest : Education, History, Popular, Philosophy
# Main : Everything else in Astrophysics
# Each HTML page is named according to the date and includes titles which link to the paper's main arXiv page, together
# with a click-to-reveal abstract.

# Usage : simply run the script and give it a date (or omit the date to use the most recent day that has an announcement) :
# python DailyArXiV.py --date YYYY-MM-DD
# Add --debug if things go wrong.
# You should also set your email address in "USER_AGENT" so that arXiv can identify if you're causing them any problems
# (it will probably work fine without this though).


#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Daily arXiv pages for one announcement day (14:00 ET → 14:00 ET):
  - Priority : astro-ph.GA + astro-ph.IM
  - Main     : astro-ph.CO + astro-ph.EP + astro-ph.HE + astro-ph.SR
  - Interest : physics.hist-ph + physics.ed-ph + physics.pop-ph

Enhancements:
  * Auto-fallback to the most recent *announced* day when --date is omitted.
  * Big H1 title with British-style date (e.g. '1st September 2025').
  * Total count + numbered items.
  * Click-to-reveal abstracts via <details>/<summary>.

Python 3.5 compatible, stdlib only.
"""

from __future__ import print_function

import sys, os, argparse, datetime, time, xml.etree.ElementTree as ET

try:
    from urllib.request import urlopen, Request
    from urllib.parse import urlencode
except ImportError:
    from urllib2 import urlopen, Request
    from urllib import urlencode

# arXiv export API endpoint. Plain HTTP is deliberate: it avoids flaky TLS timeouts.
API_BASE = "http://export.arxiv.org/api/query"
# Sent with every request so arXiv can identify the client — put your email here.
USER_AGENT = "arxiv-daily-html (contact: example@example.com)"

# arXiv categories grouped by output page.
PRIORITY = [
    "astro-ph.GA",
    "astro-ph.IM",
]
MAIN = [
    "astro-ph.CO",
    "astro-ph.EP",
    "astro-ph.HE",
    "astro-ph.SR",
]
INTEREST = [
    "physics.hist-ph",
    "physics.ed-ph",
    "physics.pop-ph",
]

# Namespace prefixes used for ElementTree lookups on the Atom feed.
NS = {
    "atom": "http://www.w3.org/2005/Atom",
    "arxiv": "http://arxiv.org/schemas/atom",
}

# ---------- date / time helpers ----------

def yymmdd(d):
    """Return date *d* as a six-digit ``YYMMDD`` string (two-digit year)."""
    two_digit_year = d.year % 100
    return "%02d%02d%02d" % (two_digit_year, d.month, d.day)

def ordinal(n):
    """Return *n* followed by its English ordinal suffix, e.g. 1 -> '1st', 12 -> '12th'."""
    last_two = n % 100
    if last_two < 10 or last_two > 20:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    else:
        # 10..20 (including the irregular 11th, 12th, 13th) always take "th".
        suffix = "th"
    return "{}{}".format(n, suffix)

def british_date(d):
    """Format date *d* in British long form, e.g. '1st September 2025'."""
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    parts = [ordinal(d.day), month_names[d.month - 1], str(d.year)]
    return " ".join(parts)

def parse_atom_dt(s):
    """Parse an Atom timestamp of the form 'YYYY-MM-DDTHH:MM:SSZ'.

    Returns a naive ``datetime`` (no tzinfo); the trailing 'Z' means the
    value is in UTC. Raises ``ValueError`` if *s* does not match the format.
    """
    atom_format = "%Y-%m-%dT%H:%M:%SZ"
    return datetime.datetime.strptime(s, atom_format)

def nth_weekday(year, month, weekday, n):
    """Return the date of the *n*-th occurrence of *weekday* counted from the 1st of the month.

    *weekday* follows ``date.weekday()`` numbering (Monday=0 … Sunday=6);
    *n* is 1-based.
    """
    month_start = datetime.date(year, month, 1)
    # Days from the 1st to the first occurrence of the requested weekday (0..6).
    days_to_first = (weekday - month_start.weekday()) % 7
    return month_start + datetime.timedelta(days=days_to_first, weeks=n - 1)

def eastern_utc_offset_hours(et_date):
    """Return how many hours US Eastern time is behind UTC on *et_date*.

    Gives 4 from the 2nd Sunday of March (inclusive) up to the 1st Sunday of
    November (exclusive), and 5 otherwise. The switch is decided per whole
    day; the time of day at which the clocks change is not considered.
    """
    sunday = 6
    dst_begins = nth_weekday(et_date.year, 3, sunday, 2)
    dst_ends = nth_weekday(et_date.year, 11, sunday, 1)
    if dst_begins <= et_date < dst_ends:
        return 4
    return 5

def announcement_window_utc(et_day):
    """Return ``(start_utc, end_utc)`` for the announcement day *et_day*.

    The window runs from 14:00 Eastern on the previous calendar day to 14:00
    Eastern on *et_day*; both ends are returned as naive datetimes in UTC.
    NOTE(review): the window is always one calendar day long — whether
    weekends/holidays need a longer window is not handled here; confirm
    against arXiv's schedule.
    """
    def cutoff_utc(day):
        # 14:00 local on `day`, shifted to UTC with that day's own offset.
        local_cutoff = datetime.datetime.combine(day, datetime.time(14, 0))
        return local_cutoff + datetime.timedelta(hours=eastern_utc_offset_hours(day))

    previous_day = et_day - datetime.timedelta(days=1)
    return cutoff_utc(previous_day), cutoff_utc(et_day)

def today_et_date(now_utc=None):
    """Return the current calendar date in US Eastern time.

    *now_utc* is a naive UTC datetime; when omitted the current UTC time is used.
    """
    utc_now = datetime.datetime.utcnow() if now_utc is None else now_utc
    # First guess the Eastern date with a 4-hour shift, then redo the shift
    # with the offset that applies to that guessed date.
    guess = (utc_now - datetime.timedelta(hours=4)).date()
    offset = datetime.timedelta(hours=eastern_utc_offset_hours(guess))
    return (utc_now - offset).date()

# ---------- HTTP / API ----------

def http_get(url, timeout=20, retries=3, debug=False):
    """Fetch *url* and return the raw response body, retrying on failure.

    Args:
        url: URL to request; the module's USER_AGENT is sent as a header.
        timeout: Per-attempt timeout in seconds, passed to ``urlopen``.
        retries: Maximum number of attempts. Values below 1 are treated as 1
            so that at least one request is always made.
        debug: When true, print the final error to stderr before raising.

    Returns:
        Whatever the response's ``read()`` returns (the body bytes).

    Raises:
        Exception: The error from the last failed attempt once every attempt
            has failed.
    """
    req = Request(url, headers={"User-Agent": USER_AGENT})
    attempts = max(1, retries)  # guarantees last_err is set if we fall through
    last_err = None
    for attempt in range(attempts):
        try:
            r = urlopen(req, timeout=timeout)
            try:
                return r.read()
            finally:
                r.close()  # release the connection even if read() fails
        except Exception as e:
            last_err = e
            if attempt + 1 < attempts:
                # Linear back-off between attempts; no point sleeping after
                # the last one, we are about to raise anyway.
                time.sleep(3 + 2 * attempt)
    if debug:
        print("HTTP error after retries:", last_err, file=sys.stderr)
    raise last_err

def query_recent(cat, max_results=1000, debug=False):
    """Download the most recently updated entries for one arXiv category.

    Builds an API query for ``cat:<cat>`` sorted by last-updated date,
    newest first, fetches it with ``http_get`` and returns the raw response
    body. Always sleeps 3 seconds after the request to stay polite to the API.
    With *debug* set, the request URL is printed first.
    """
    query_string = urlencode({
        "search_query": "cat:%s" % cat,
        "start": "0",
        "max_results": str(max_results),
        "sortBy": "lastUpdatedDate",
        "sortOrder": "descending",
    })
    url = "{}?{}".format(API_BASE, query_string)
    if debug:
        print("GET", url)
    payload = http_get(url, debug=debug)
    time.sleep(3)  # rate-limit ourselves between consecutive API calls
    return payload

def _base_arxiv_id(entry_url):
    """Return the version-less arXiv identifier contained in an Atom entry id URL.

    Handles both new-style ids ('…/abs/2509.01234v2' -> '2509.01234') and
    old-style ids that contain a slash
    ('…/abs/astro-ph/0301001v1' -> 'astro-ph/0301001').
    """
    if "/abs/" in entry_url:
        ident = entry_url.split("/abs/", 1)[1]
    else:
        # Unknown URL shape: fall back to the last path component.
        ident = entry_url.rsplit("/", 1)[-1]
    # Strip a trailing 'v<digits>' only; a 'v' inside an archive name
    # (e.g. 'solv-int/…') must not be treated as a version marker.
    head, sep, tail = ident.rpartition("v")
    if sep and head and tail.isdigit():
        return head
    return ident


def parse_entries(xml_bytes, debug=False):
    """Parse an Atom feed into a list of plain dicts, one per ``<entry>``.

    Args:
        xml_bytes: The feed document, as accepted by ``ET.fromstring``.
        debug: Currently unused.

    Returns:
        A list of dicts with the keys:
          "id"            version-less arXiv identifier
          "title"         title with all whitespace runs collapsed to one space
          "summary"       abstract text, stripped at both ends
          "link"          https://arxiv.org/abs/<id>
          "primary_cat"   ``term`` of ``arxiv:primary_category`` ("" if absent)
          "published_utc" / "updated_utc"  naive datetimes, or None if absent
          "version_count" number of ``arxiv:version`` child elements found

    Raises:
        ValueError: from ``parse_atom_dt`` if a timestamp is present but not
            in the expected format.
    """
    root = ET.fromstring(xml_bytes)
    out = []
    for e in root.findall("atom:entry", NS):
        eid = e.findtext("atom:id", "", NS).strip()
        # Titles are wrapped across lines in the feed; split()/join removes
        # the newlines together with the indentation that follows them.
        title = " ".join(e.findtext("atom:title", "", NS).split())
        summ = e.findtext("atom:summary", "", NS).strip()
        published = e.findtext("atom:published", "", NS).strip()
        updated = e.findtext("atom:updated", "", NS).strip()
        prim = e.find("arxiv:primary_category", NS)
        primary_cat = prim.get("term") if prim is not None else ""
        # NOTE(review): the arXiv API feed may not emit <arxiv:version>, in
        # which case this count is always 0 — confirm before relying on it.
        versions = e.findall("arxiv:version", NS)

        base_id = _base_arxiv_id(eid)

        out.append({
            "id": base_id,
            "title": title,
            "summary": summ,
            "link": "https://arxiv.org/abs/" + base_id,
            "primary_cat": primary_cat,
            "published_utc": parse_atom_dt(published) if published else None,
            "updated_utc": parse_atom_dt(updated) if updated else None,
            "version_count": len(versions),
        })
    return out

# ---------- core selection ----------

def collect_for_day(categories, et_day, debug=False):
    """Collect the entries of *categories* updated inside *et_day*'s window.

    For each category the newest entries are downloaded (sorted by
    last-updated date, newest first) and kept when the category is the
    entry's primary category and ``win_start <= updated < win_end``.

    Args:
        categories: Iterable of arXiv category names.
        et_day: Announcement day (a date) passed to ``announcement_window_utc``.
        debug: Forwarded to the helpers; also prints the UTC window.

    Returns:
        Dict mapping arXiv id -> entry dict (as produced by ``parse_entries``).
        A warning is printed to stderr when a category's feed was cut off by
        the per-request entry limit before reaching the start of the window,
        because the result may then be incomplete.
    """
    win_start, win_end = announcement_window_utc(et_day)
    if debug:
        print("Window UTC:", win_start, "→", win_end)
    limit = 1000  # entries requested per category
    bag = {}
    for cat in categories:
        feed = query_recent(cat, max_results=limit, debug=debug)
        entries = parse_entries(feed, debug=debug)
        reached_start = False
        for it in entries:
            u = it["updated_utc"]
            if u is None:
                continue
            if u < win_start:
                # Feed is sorted newest-first, so nothing further can match.
                reached_start = True
                break
            if it["primary_cat"] != cat:  # exclude cross-lists
                continue
            if u < win_end:
                bag[it["id"]] = it
        if not reached_start and len(entries) >= limit:
            # A full page that never got as old as win_start means older
            # entries (possibly the whole window) were cut off by the limit.
            print("Warning: {} feed was truncated at {} entries before reaching "
                  "{}; results for {} may be incomplete.".format(
                      cat, len(entries), win_start, et_day),
                  file=sys.stderr)
    return bag

# ---------- HTML ----------

def html_escape(s):
    """Escape *s* for use in HTML text and in quoted attribute values.

    Replaces ``&``, ``<``, ``>``, ``"`` and ``'`` with character references.
    The ampersand is handled first so the references produced by the later
    replacements are not escaped a second time.
    """
    return (s.replace("&", "&amp;")
             .replace("<", "&lt;")
             .replace(">", "&gt;")
             .replace('"', "&quot;")
             .replace("'", "&#x27;"))

def write_html(entries, et_day, label, out_dir):
    """Write one HTML page listing *entries* and return its path.

    Args:
        entries: Mapping of arXiv id -> entry dict; the "title", "link",
            "summary", "updated_utc" and "published_utc" keys are used.
        et_day: Date the page is for; used in the file name and the heading.
        label: Page name (e.g. "priority"); used in the file name and,
            capitalised, in the heading.
        out_dir: Output directory, created if missing. An empty string means
            the current directory.

    Returns:
        Path of the file written: ``<out_dir>/<YYMMDD>_<label>.html``.
    """
    import io  # local import: io.open accepts encoding= on Python 2 and 3 alike

    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    stamp = yymmdd(et_day)
    path = os.path.join(out_dir, "{}_{}.html".format(stamp, label))

    # Oldest first; entries without any timestamp sort to the very top.
    # Ties are broken alphabetically by title.
    fallback = datetime.datetime(1900, 1, 1)
    items = sorted(
        entries.values(),
        key=lambda it: (it["updated_utc"] or it["published_utc"] or fallback, it["title"].lower())
    )
    count = len(items)

    title_text = "ArXiv {} Feed for {}".format(label.capitalize(), british_date(et_day))

    with io.open(path, "w", encoding="utf-8") as f:
        f.write(u'<!doctype html>\n<html lang="en"><head><meta charset="utf-8">\n')
        f.write(u"<title>{}</title>\n".format(html_escape(title_text)))
        # Tiny bit of style for readability
        f.write(u"<style>body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif;max-width:800px;margin:40px auto;line-height:1.5} h1{font-size:1.6em;margin:0 0 0.2em 0} .meta{color:#555;margin:0 0 1em 0} details{margin:0 0 1em 0} summary{font-weight:600;cursor:pointer} .abs{margin:0.4em 0 0 1.2em;white-space:pre-wrap}</style>\n")
        f.write(u"</head><body>\n")
        f.write(u"<h1>{}</h1>\n".format(html_escape(title_text)))
        f.write(u"<p class=\"meta\">Total entries: {}</p>\n".format(count))

        # Numbered list via explicit counters for simple spacing
        for idx, it in enumerate(items, 1):
            # The link lands inside a double-quoted attribute, so quotes must
            # be escaped too, whatever html_escape does about them.
            href = html_escape(it["link"]).replace('"', "&quot;")
            title_line = u"{:d}. <a href=\"{link}\">{title}</a>".format(
                idx, link=href, title=html_escape(it["title"]))
            f.write(u"<details>\n  <summary>{}</summary>\n".format(title_line))
            abs_text = html_escape((it.get("summary") or "").strip())
            if abs_text:
                f.write(u"  <div class=\"abs\">{}</div>\n".format(abs_text))
            f.write(u"</details>\n")

        f.write(u"</body></html>\n")
    return path

# ---------- main with auto-fallback ----------

def build_for_et_day(et_day, out_dir, debug=False):
    """Collect all three category groups for *et_day* and write their pages.

    Returns ``(total, (priority_path, main_path, interest_path))`` where
    *total* is the combined number of entries across the three groups.
    The pages are written even when a group has no entries.
    """
    priority_bag = collect_for_day(PRIORITY, et_day, debug=debug)
    main_bag = collect_for_day(MAIN, et_day, debug=debug)
    interest_bag = collect_for_day(INTEREST, et_day, debug=debug)
    if debug:
        print("Counts → priority:", len(priority_bag), " main:", len(main_bag), " interest:", len(interest_bag))

    # Write in the fixed order priority, main, interest.
    pages = ((priority_bag, "priority"), (main_bag, "main"), (interest_bag, "interest"))
    written = tuple([write_html(bag, et_day, label, out_dir) for bag, label in pages])
    total = sum(len(bag) for bag, _ in pages)
    return total, written

def main():
    """Command-line entry point.

    With ``--date`` the pages for exactly that day are built. Without it,
    today's US Eastern date and then up to four earlier days are tried, and
    the first day with at least one entry is kept. Pages written for probed
    days that turned out empty are removed again so they do not pile up in
    the output directory.

    Returns:
        0 on success, 1 if no day with entries was found in auto mode,
        2 if ``--date`` could not be parsed.
    """
    ap = argparse.ArgumentParser(description="Generate arXiv title-only pages for one announcement day (ET 14:00→14:00), with abstracts.")
    ap.add_argument("--date", help="Target announcement day in YYYY-MM-DD (US Eastern). Default: auto-detect most recent announcement.", default=None)
    ap.add_argument("--out",  help="Output directory. Default: ./docs", default="docs")
    ap.add_argument("--debug", action="store_true", help="Verbose debug output")
    args = ap.parse_args()

    if args.date:
        try:
            et_day = datetime.datetime.strptime(args.date, "%Y-%m-%d").date()
        except ValueError:
            print("Invalid --date format. Use YYYY-MM-DD.", file=sys.stderr)
            return 2
        # An explicitly requested day is always written, even when empty.
        total, paths = build_for_et_day(et_day, args.out, debug=args.debug)
        print("Wrote:\n  {}\n  {}\n  {}".format(paths[0], paths[1], paths[2]))
        return 0

    # Auto mode : pick the most recent day with an announcement
    probe = today_et_date()
    for back in range(0, 5):  # today, then up to 4 prior ET days
        et_try = probe - datetime.timedelta(days=back)
        if args.debug: print("ET announcement day candidate:", et_try)
        total, paths = build_for_et_day(et_try, args.out, debug=args.debug)
        if total > 0:
            if args.debug and back > 0:
                print("No announcement detected for more recent day(s), used:", et_try)
            print("Wrote:\n  {}\n  {}\n  {}".format(paths[0], paths[1], paths[2]))
            return 0
        # Nothing was announced for this candidate: drop the empty pages the
        # build step just wrote for it.
        for stale in paths:
            try:
                os.remove(stale)
            except OSError as err:
                if args.debug:
                    print("Could not remove empty page:", stale, err, file=sys.stderr)

    print("No announcements found in the last 5 ET days — nothing to write.", file=sys.stderr)
    return 1

# Run only when executed as a script; main()'s return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
