Initial commit.

2026-01-10 21:58:42 +00:00
commit 9ab19b842d
7 changed files with 4528 additions and 0 deletions
--- a/18
+++ b/18
@@ -0,0 +1,18 @@
+# Build the BibTeX compiler: flex and bison generate C++ sources into $(DISTDIR),
+# which are then compiled and linked into $(DISTDIR)/bibtex_compiler.
+CXX=g++
+LEX=flex
+YACC=bison
+CXXFLAGS?=-std=c++17
+# GCC 13+ can warn on Bison skeleton code even though it's guarded by `if (yyss != yyssa)`.
+CXXWARN?=-Wno-free-nonheap-object
+SRCDIR=src
+DISTDIR=dist
+# Create the output directory while the makefile is read, before any rule runs.
+$(shell mkdir -p $(DISTDIR))
+# `all` and `clean` name actions, not files: without .PHONY a stray file with
+# either name would stop the corresponding rule from running.
+.PHONY: all clean
+all: $(DISTDIR)/bibtex_compiler
+$(DISTDIR)/bibtex_lexer.cpp: $(SRCDIR)/bibtex_lexer.l
+	$(LEX) -o $@ $<
+# -d makes bison write the header next to the .cpp named by -o.
+$(DISTDIR)/bibtex_parser.cpp $(DISTDIR)/bibtex_parser.hpp: $(SRCDIR)/bibtex_parser.y
+	$(YACC) -d -o $(DISTDIR)/bibtex_parser.cpp $<
+# The header is a prerequisite (the lexer includes it) but is not compiled itself.
+$(DISTDIR)/bibtex_compiler: $(DISTDIR)/bibtex_lexer.cpp $(DISTDIR)/bibtex_parser.cpp $(DISTDIR)/bibtex_parser.hpp
+	$(CXX) $(CXXFLAGS) $(CXXWARN) -o $@ $(DISTDIR)/bibtex_lexer.cpp $(DISTDIR)/bibtex_parser.cpp
+clean:
+	rm -f $(DISTDIR)/bibtex_lexer.cpp $(DISTDIR)/bibtex_parser.cpp $(DISTDIR)/bibtex_parser.hpp $(DISTDIR)/bibtex_compiler
--- a/build_and_test.sh
+++ b/build_and_test.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Rebuild the BibTeX compiler and run it over every sample input.
+# Exits non-zero when the build fails or when any input is rejected.
+set -e
+echo "Compiling BibTeX compiler..."
+make clean && make
+echo "Running compiler on sample.bib:"
+failures=0
+# run_test NUMBER FILE: feed test/FILE to the compiler and report the outcome.
+# The compiler runs as the `if` condition because, under `set -e`, a plain
+# invocation followed by a `$?` check would abort the script on the first
+# failure and the FAILED line could never be printed.
+run_test() {
+    if ./dist/bibtex_compiler < "test/$2"; then
+        echo -e "[\033[0;32mPASSED\033[0m] Test $1 ($2)."
+    else
+        echo -e "[\033[0;31mFAILED\033[0m] Test $1 ($2)."
+        failures=$((failures + 1))
+    fi
+}
+run_test 1 sample.bib
+run_test 2 A_Theory_of_Justice.bibtex
+run_test 3 big_file.bib
+echo "All tests completed."
+# Propagate failures so callers can detect them from the exit status.
+if [ "$failures" -ne 0 ]; then
+    exit 1
+fi
--- a/src/bibtex_lexer.l
+++ b/src/bibtex_lexer.l
@@ -0,0 +1,98 @@
+%{
+#include "bibtex_parser.hpp"
+#include <string>
+#include <cstring>
+extern int yylineno;
+extern YYLTYPE yylloc;
+static int yycolumn = 1;
+
+// Record in yylloc the location of a token that is `len` characters long and
+// sits entirely on the current line, starting at (yylineno, yycolumn), then
+// advance yycolumn past it. Columns are 1-based; last_column is inclusive.
+#define SET_LOC(len) \
+	do { \
+		yylloc.first_line = yylineno; \
+		yylloc.first_column = yycolumn; \
+		yylloc.last_line = yylineno; \
+		yylloc.last_column = yycolumn + (len) - 1; \
+		yycolumn += (len); \
+	} while (0)
+
+// Record in yylloc the span of `text`, which may contain newlines, beginning at
+// the current (yylineno, yycolumn). On return yylineno/yycolumn point just past
+// the text and yylloc.last_* holds the position of its final character (or the
+// start position when `text` is empty).
+static void set_loc_for_text(const std::string& text) {
+	// The token begins wherever the scanner currently stands.
+	yylloc.first_line = yylineno;
+	yylloc.first_column = yycolumn;
+	// Until a character is consumed, the token also ends there.
+	yylloc.last_line = yylineno;
+	yylloc.last_column = yycolumn;
+
+	for (const char ch : text) {
+		// Position of the character about to be consumed.
+		yylloc.last_line = yylineno;
+		yylloc.last_column = yycolumn;
+		if (ch != '\n') {
+			++yycolumn;
+		} else {
+			++yylineno;
+			yycolumn = 1;
+		}
+	}
+}
+%}
+
+%option noyywrap
+%option yylineno
+
+%x VALUE
+
+%%
+"@"                { SET_LOC(yyleng); return AT; }
+"{"                { SET_LOC(yyleng); return LBRACE; }
+"}"                { SET_LOC(yyleng); return RBRACE; }
+","                { SET_LOC(yyleng); return COMMA; }
+"="                { /* a field value follows: scan it in the VALUE start condition */ SET_LOC(yyleng); BEGIN(VALUE); return EQUALS; }
+"#"                { SET_LOC(yyleng); return HASH; }
+
+[a-zA-Z0-9_][a-zA-Z0-9_.:;\-!?+*/]*   { /* text is heap-allocated; the parser actions delete it */ SET_LOC(yyleng); yylval.str = new std::string(yytext); return IDENT; }
+
+\"[^\"]*\"   { /* quoted string, quotes kept in the text. NOTE(review): the pattern can span lines but SET_LOC only advances the column */ SET_LOC(yyleng); yylval.str = new std::string(yytext); return STRING; }
+
+[%][^\n]*          { /* inline or full-line comment starting with %: ignore to EOL */ yycolumn += yyleng; }
+
+[ \t\r]+           { /* skip whitespace, update column */ yycolumn += yyleng; }
+
+\n+                 { /* newline: increment line is handled by flex when %option yylineno is used */ yycolumn = 1; }
+
+.                   { /* any other character is returned as its own token code */ SET_LOC(yyleng); return yytext[0]; }
+
+<VALUE>[ \t\r]+           { /* whitespace before the value: stay in VALUE */ yycolumn += yyleng; }
+<VALUE>\n+                { /* newline before the value: stay in VALUE, restart the column */ yycolumn = 1; }
+
+<VALUE>[%][^\n]*          { /* comment inside value: ignore to EOL */ yycolumn += yyleng; }
+
+<VALUE>\"[^\"]*\"   { /* quoted value complete: return to INITIAL */ SET_LOC(yyleng); yylval.str = new std::string(yytext); BEGIN(INITIAL); return STRING; }
+<VALUE>[a-zA-Z0-9_][a-zA-Z0-9_.:;\-!?+*/]*   { /* bare word or number value complete: return to INITIAL */ SET_LOC(yyleng); yylval.str = new std::string(yytext); BEGIN(INITIAL); return IDENT; }
+
+<VALUE>"{" {
+	/*
+	 * Brace-delimited value. The opening brace has just been matched; read
+	 * raw characters until the matching close brace so that nested groups
+	 * stay inside a single STRING token. The outer braces are kept in the
+	 * token text.
+	 */
+	std::string text;
+	text.push_back('{');
+	int depth = 1;
+	/*
+	 * With %option yylineno the generated input routine advances yylineno on
+	 * every newline it reads, and set_loc_for_text() counts the newlines in
+	 * the text again. Remember the starting line and restore it before that
+	 * call so multi-line values are counted once and first_line is correct.
+	 */
+	const int start_line = yylineno;
+	while (depth > 0) {
+		const int c = yyinput();
+		/*
+		 * End of input. Some flex releases return 0 here instead of EOF;
+		 * without the extra check an unterminated value would keep
+		 * appending NUL characters forever.
+		 */
+		if (c == EOF || c == 0) break;
+		text.push_back(static_cast<char>(c));
+		if (c == '{') depth++;
+		else if (c == '}') depth--;
+	}
+	yylineno = start_line;
+	set_loc_for_text(text);
+	/* Heap-allocated text; the parser actions delete it. */
+	yylval.str = new std::string(text);
+	BEGIN(INITIAL);
+	return STRING;
+}
+
+<VALUE>. { /* unexpected character where a value should start: hand it to the parser and leave VALUE */ SET_LOC(yyleng); BEGIN(INITIAL); return yytext[0]; }
+
+%%
+
--- a/src/bibtex_parser.y
+++ b/src/bibtex_parser.y
@@ -0,0 +1,664 @@
+%{
+#include <map>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+#include <cctype>
+#include <cstddef>
+
+#include <set>
+#include <cstdlib>
+
+int yylex(void);
+void yyerror(const char* s);
+
+// One parsed BibTeX entry of the form @type{key, name=value, ...}.
+struct BibEntry {
+    std::string type;   // entry type as written after '@' (case preserved)
+    std::string key;    // citation key as written in the input
+    std::map<std::string, std::string> fields;  // field name -> value text
+};
+// Every entry accepted by the grammar, in input order; filled by the `entry`
+// rule and consumed by main().
+std::vector<BibEntry> entries;
+
+// Braced values longer than this many characters are printed wrapped over
+// several lines; it is also the maximum width of each wrapped line.
+static const std::size_t WRAP_COLUMNS = 80;
+
+// Preferred ordering of fields when printing.
+// Names are lowercase; a field's position here is its sort rank.
+static const char* FIELD_ORDER[] = {
+    "title", "author", "year", "pages", "volume", "journal","publisher", "number",
+    "url", "booktitle", "mrnumber", "mrclass", "eprint", "issn", "doi", "fjournal",
+    "mrreviewer", "series", "isbn", "date", "organization", "note",
+    "month", "archiveprefix", "school", "editor", "primaryclass", "coden", "address",
+    "edition", "location", "howpublished", "page", "eprinttype", "chapter", "type",
+    "timestamp", "shortjournal", "pdf", "origpublisher", "origdate", "institution",
+    "eprintclass", "biburl", "bibsource", "urldate", "translator", "subtitle",
+    "shortseries", "place", "origlocation", "origlanguage", "lccn", "label", "keywords",
+    "issue_date", "ignorepdf", "hal_version", "hal_id", "eventtitle", "eventdate",
+    "abstract"
+};
+static const std::size_t FIELD_ORDER_COUNT = sizeof(FIELD_ORDER) / sizeof(FIELD_ORDER[0]);
+
+// Required fields per entry type.
+// Each list is checked by check_required_fields(); names are lowercase.
+static const std::vector<std::string> REQUIRED_ARTICLE_FIELDS   = {"author", "title", "year"};
+// Currently referenced only from the disabled strict check in check_required_fields().
+static const std::set<std::string> REQUIRED_ARTICLE_JOURNAL_OR_HOWPUBLISHED = {"journal", "howpublished"};
+// TODO enable extra field checks via a --strict flag
+static const std::vector<std::string> REQUIRED_BOOK_FIELDS      = {"title", /* "publisher", */ "year"};
+static const std::vector<std::string> REQUIRED_INPROC_FIELDS    = {"author", "title", "booktitle", "year"};
+static const std::vector<std::string> REQUIRED_INCOLL_FIELDS    = {"author", "title", "booktitle", "publisher", "year"};
+// Shared by the phdthesis and mastersthesis entry types.
+static const std::vector<std::string> REQUIRED_THESIS_FIELDS    = {"author", "title", "school", "year"};
+static const std::vector<std::string> REQUIRED_TECHREPORT_FIELDS= {"author", "title", "institution", "year"};
+static const std::vector<std::string> REQUIRED_BOOKLET_FIELDS   = {"title"};
+// A book needs at least one of these.
+static const std::set<std::string> REQUIRED_BOOK_AUTHOR_OR_EDITOR = {"author", "editor"};
+
+// Map from (lowercased) field name to its rank in FIELD_ORDER.
+// Empty until init_field_order() has run.
+static std::map<std::string, int> field_order_index;
+
+// Return a copy of `s` with every byte passed through std::tolower.
+static std::string to_lower_copy(const std::string& s) {
+    std::string lowered;
+    lowered.reserve(s.size());
+    for (const char ch : s) {
+        // Go through unsigned char: std::tolower is undefined for negative values.
+        lowered.push_back(static_cast<char>(std::tolower(static_cast<unsigned char>(ch))));
+    }
+    return lowered;
+}
+
+// Return `s` without leading and trailing whitespace (as judged by std::isspace).
+static std::string trim(const std::string& s) {
+    const auto is_blank = [](char ch) {
+        return std::isspace(static_cast<unsigned char>(ch)) != 0;
+    };
+    std::size_t first = 0;
+    std::size_t last = s.size();  // one past the last kept character
+    for (; first < last && is_blank(s[first]); ++first) {}
+    for (; last > first && is_blank(s[last - 1]); --last) {}
+    return s.substr(first, last - first);
+}
+
+// Rewrite a single author name into "First Last [Suffix]" order.
+//
+//   "First Last"          -> returned unchanged (after trimming)
+//   "Last, First"         -> "First Last"
+//   "Last, Suffix, First" -> "First Last Suffix" (parts after the third are ignored)
+//
+// Only commas outside braces separate parts, so a brace-protected name such as
+// "{Barnes and Noble, Inc.}" is left intact. Empty parts (e.g. the trailing
+// comma in "Rawls,") are dropped instead of leaving stray spaces in the result.
+static std::string normalize_one_author(const std::string& name) {
+    const std::string t = trim(name);
+    if (t.empty()) return t;
+
+    // Split on top-level commas.
+    std::vector<std::string> parts;
+    std::string current;
+    int depth = 0;
+    for (const char ch : t) {
+        if (ch == '{') {
+            ++depth;
+        } else if (ch == '}' && depth > 0) {
+            --depth;
+        }
+        if (ch == ',' && depth == 0) {
+            parts.push_back(trim(current));
+            current.clear();
+        } else {
+            current.push_back(ch);
+        }
+    }
+    parts.push_back(trim(current));
+
+    if (parts.size() == 1) {
+        // No separating comma: already in "First Last" style.
+        return t;
+    }
+
+    const std::string& last = parts[0];
+    const std::string& first = (parts.size() == 2) ? parts[1] : parts[2];
+    const std::string suffix = (parts.size() == 2) ? std::string() : parts[1];
+
+    // Join the non-empty pieces with single spaces.
+    std::string result;
+    const auto append = [&result](const std::string& piece) {
+        if (piece.empty()) return;
+        if (!result.empty()) result += ' ';
+        result += piece;
+    };
+    append(first);
+    append(last);
+    append(suffix);
+    return result;
+}
+
+// Normalize an author list (already stripped of its outer braces or quotes).
+//
+// Authors are separated by the word "and" in any letter case with whitespace
+// on both sides. Any whitespace counts, so a list broken across lines after
+// "and" is split correctly, and an "and" inside braces is not a separator.
+// Each author goes through normalize_one_author(); empty authors are skipped
+// and the result is joined with " and ".
+static std::string normalize_author_inner(const std::string& inner) {
+    const auto is_space = [](char c) {
+        return std::isspace(static_cast<unsigned char>(c)) != 0;
+    };
+    const auto lower = [](char c) {
+        return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+    };
+
+    std::vector<std::string> authors;
+    const std::size_t n = inner.size();
+    std::size_t piece_start = 0;
+    int depth = 0;
+    for (std::size_t i = 0; i < n; ++i) {
+        const char ch = inner[i];
+        if (ch == '{') { ++depth; continue; }
+        if (ch == '}') { if (depth > 0) --depth; continue; }
+        // A separator is five characters long: space, 'a', 'n', 'd', space.
+        if (depth > 0 || !is_space(ch) || i + 4 >= n) continue;
+        const bool is_and = lower(inner[i + 1]) == 'a' &&
+                            lower(inner[i + 2]) == 'n' &&
+                            lower(inner[i + 3]) == 'd' &&
+                            is_space(inner[i + 4]);
+        if (!is_and) continue;
+        authors.push_back(trim(inner.substr(piece_start, i - piece_start)));
+        piece_start = i + 5;
+        i += 4;  // the loop increment then lands just past the separator
+    }
+    authors.push_back(trim(inner.substr(piece_start)));
+
+    std::string result;
+    for (const auto& a : authors) {
+        if (a.empty()) continue;
+        const std::string norm = normalize_one_author(a);
+        if (norm.empty()) continue;
+        if (!result.empty()) result += " and ";
+        result += norm;
+    }
+    return result;
+}
+
+// Normalize a raw `author` value: drop one layer of surrounding braces or
+// double quotes if present, normalize the author list inside, and return it
+// wrapped in braces.
+static std::string normalize_author_value(const std::string& raw) {
+    std::string body = trim(raw);
+    if (body.size() >= 2) {
+        const char open = body.front();
+        const char close = body.back();
+        const bool braced = (open == '{' && close == '}');
+        const bool quoted = (open == '"' && close == '"');
+        if (braced || quoted) {
+            body = body.substr(1, body.size() - 2);
+        }
+    }
+    return "{" + normalize_author_inner(body) + "}";
+}
+
+// Fill field_order_index from FIELD_ORDER (name -> position). Does nothing if
+// the index has already been populated.
+static void init_field_order() {
+    if (field_order_index.empty()) {
+        int rank = 0;
+        for (const char* field_name : FIELD_ORDER) {
+            field_order_index[field_name] = rank++;
+        }
+    }
+}
+
+// Sort rank of a field name (case-insensitive): its position in FIELD_ORDER,
+// or FIELD_ORDER_COUNT for names not listed there, so unknown fields sort
+// after all known ones.
+static int field_rank_of(const std::string& name) {
+    const auto found = field_order_index.find(to_lower_copy(name));
+    return found == field_order_index.end()
+               ? static_cast<int>(FIELD_ORDER_COUNT)
+               : found->second;
+}
+
+// --- Helpers for generating canonical entry IDs ---
+
+// Trim `s`, then remove one surrounding pair of braces or double quotes when
+// the first and last characters form such a pair. Anything else is returned
+// trimmed but otherwise unchanged.
+static std::string strip_outer_braces_or_quotes(const std::string& s) {
+    const std::string t = trim(s);
+    if (t.size() < 2) return t;
+
+    const char head = t.front();
+    const char tail = t.back();
+    const bool braced = (head == '{' && tail == '}');
+    const bool quoted = (head == '"' && tail == '"');
+    if (!braced && !quoted) return t;
+    return t.substr(1, t.size() - 2);
+}
+
+// Value of the first field in `e` whose name equals `name` ignoring case, or
+// an empty string when there is no such field.
+static std::string get_field_value(const BibEntry& e, const char* name) {
+    const std::string wanted = to_lower_copy(name);
+    const auto hit = std::find_if(
+        e.fields.begin(), e.fields.end(),
+        [&wanted](const std::pair<const std::string, std::string>& field) {
+            return to_lower_copy(field.first) == wanted;
+        });
+    return hit != e.fields.end() ? hit->second : std::string();
+}
+
+// True when `w`, compared case-insensitively, is one of the short function
+// words that are skipped when picking title words for an entry ID.
+static bool is_title_stop_word(const std::string& w) {
+    static const char* const STOP_WORDS[] = {
+        "a", "an", "the", "of", "in", "on", "and", "for", "to", "with",
+        "from", "by", "about", "into", "over", "after", "before", "between",
+        "without", "within"
+    };
+
+    const std::string lowered = to_lower_copy(w);
+    return std::any_of(std::begin(STOP_WORDS), std::end(STOP_WORDS),
+                       [&lowered](const char* stop) { return lowered == stop; });
+}
+
+// Split a raw title value into lowercase words. A word is a maximal run of
+// characters accepted by std::isalnum; everything else (spaces, punctuation,
+// braces) separates words. One layer of outer braces or quotes is removed first.
+static std::vector<std::string> split_title_words(const std::string& raw_title) {
+    const std::string inner = strip_outer_braces_or_quotes(raw_title);
+    std::vector<std::string> words;
+    std::string word;
+    for (const char ch : inner) {
+        const unsigned char uch = static_cast<unsigned char>(ch);
+        if (std::isalnum(uch)) {
+            word.push_back(static_cast<char>(std::tolower(uch)));
+            continue;
+        }
+        if (!word.empty()) {
+            words.push_back(word);
+            word.clear();
+        }
+    }
+    if (!word.empty()) {
+        words.push_back(word);
+    }
+    return words;
+}
+
+// First word of the title that is not a stop word. Falls back to the very
+// first word when every word is a stop word, and to an empty string when the
+// title contains no words at all.
+static std::string first_content_word_from_title(const std::string& raw_title) {
+    const std::vector<std::string> words = split_title_words(raw_title);
+    for (const auto& w : words) {
+        if (!is_title_stop_word(w)) {
+            return w;
+        }
+    }
+    return words.empty() ? std::string() : words.front();
+}
+
+// Second word of the title that is not a stop word, or an empty string when
+// the title has fewer than two such words.
+static std::string second_content_word_from_title(const std::string& raw_title) {
+    int content_word_count = 0;
+    for (const auto& w : split_title_words(raw_title)) {
+        if (is_title_stop_word(w)) continue;
+        if (++content_word_count == 2) return w;
+    }
+    return std::string();
+}
+
+// Lowercased surname of the first author in `raw_author`, for use in entry IDs.
+//
+// The first author is the text before the first " and " (case-insensitive).
+// The surname is taken to be its last whitespace-separated token, which
+// assumes the value is already in "First Last" order. Characters that are
+// awkward in IDs are removed. Returns an empty string when there is no author.
+static std::string lead_author_surname(const std::string& raw_author) {
+    std::string lead = strip_outer_braces_or_quotes(raw_author);
+    const std::size_t and_pos = to_lower_copy(lead).find(" and ");
+    if (and_pos != std::string::npos) {
+        lead.erase(and_pos);
+    }
+    lead = trim(lead);
+    if (lead.empty()) return std::string();
+
+    // `lead` is trimmed, so its last token runs from just after the final
+    // whitespace character to the end of the string.
+    std::size_t token_begin = lead.size();
+    while (token_begin > 0 &&
+           !std::isspace(static_cast<unsigned char>(lead[token_begin - 1]))) {
+        --token_begin;
+    }
+    const std::string surname = lead.substr(token_begin);
+
+    // Remove problematic characters when forming IDs: "'.=\^{}~
+    const std::string dropped = "\"'.=\\^{}~";
+    std::string kept;
+    for (const char ch : surname) {
+        if (dropped.find(ch) != std::string::npos) continue;
+        kept.push_back(ch);
+    }
+    return to_lower_copy(kept);
+}
+
+// First run of decimal digits in `raw_year` (after stripping outer braces or
+// quotes), cut to at most four digits. Empty when the value has no digit.
+static std::string extract_year_digits(const std::string& raw_year) {
+    const std::string s = strip_outer_braces_or_quotes(raw_year);
+    const auto is_digit_at = [&s](std::size_t i) {
+        return std::isdigit(static_cast<unsigned char>(s[i])) != 0;
+    };
+
+    std::size_t run_begin = 0;
+    while (run_begin < s.size() && !is_digit_at(run_begin)) ++run_begin;
+    std::size_t run_end = run_begin;
+    while (run_end < s.size() && is_digit_at(run_end)) ++run_end;
+
+    const std::size_t length = std::min<std::size_t>(run_end - run_begin, 4);
+    return s.substr(run_begin, length);
+}
+
+// True when `raw`, after stripping outer braces or quotes, is exactly four
+// decimal digits.
+static bool is_year_only_date(const std::string& raw) {
+    const std::string s = strip_outer_braces_or_quotes(raw);
+    return s.size() == 4 &&
+           std::all_of(s.begin(), s.end(), [](char ch) {
+               return std::isdigit(static_cast<unsigned char>(ch)) != 0;
+           });
+}
+
+// Forward declaration for helper used below.
+static bool has_field(const BibEntry& e, const std::string& name);
+
+// If `date` is of the form YYYY-MM and there are no
+// explicit `year` or `month` fields yet, replace that
+// single `date` field with separate `year` and `month`.
+//
+// The year must be exactly four digits and the month one or two digits in the
+// range 1-12, with nothing after it; any other `date` is left untouched. The
+// new values are stored without braces and the month without zero-padding.
+static void split_year_month_from_date(BibEntry& e) {
+    if (has_field(e, "year") || has_field(e, "month"))
+        return;
+
+    // Locate the `date` field, whatever its letter case.
+    auto it_date = e.fields.end();
+    for (auto it = e.fields.begin(); it != e.fields.end(); ++it) {
+        if (to_lower_copy(it->first) == "date") {
+            it_date = it;
+            break;
+        }
+    }
+    if (it_date == e.fields.end()) return;
+
+    const std::string inner = strip_outer_braces_or_quotes(it_date->second);
+    std::string year;
+    std::string month;
+
+    std::size_t i = 0;
+    while (i < inner.size() && std::isdigit(static_cast<unsigned char>(inner[i]))) {
+        year.push_back(inner[i]);
+        ++i;
+    }
+    if (year.size() != 4) return;
+    if (i >= inner.size() || inner[i] != '-') return;
+    ++i;
+    while (i < inner.size() && std::isdigit(static_cast<unsigned char>(inner[i]))) {
+        month.push_back(inner[i]);
+        ++i;
+    }
+    // Bounding the length keeps std::atoi below within range: its behavior
+    // is undefined when the digits do not fit in an int.
+    if (month.empty() || month.size() > 2) return;
+    if (i != inner.size()) return; // extra trailing characters, not pure YYYY-MM
+
+    // Basic month range check (1-12).
+    const int month_num = std::atoi(month.c_str());
+    if (month_num < 1 || month_num > 12) return;
+
+    // All conditions satisfied: rewrite fields.
+    e.fields.erase(it_date);
+    e.fields["year"] = year;
+    // Store month without zero-padding (e.g. "01" -> "1").
+    e.fields["month"] = std::to_string(month_num);
+}
+
+// Remove zero-padding from purely numeric `month` values ("05" -> "5").
+// Outer braces or quotes are dropped from such values as well; a value made
+// only of zeros becomes "0". Non-numeric months are left untouched.
+static void normalize_month_in_entry(BibEntry& e) {
+    for (auto& field : e.fields) {
+        if (to_lower_copy(field.first) != "month") continue;
+
+        const std::string digits = strip_outer_braces_or_quotes(field.second);
+        const bool numeric =
+            !digits.empty() &&
+            std::all_of(digits.begin(), digits.end(), [](char ch) {
+                return std::isdigit(static_cast<unsigned char>(ch)) != 0;
+            });
+        if (!numeric) continue;
+
+        // Start at the first non-zero digit, but always keep the final digit
+        // (find_first_not_of yields npos when every digit is zero).
+        const std::size_t first_kept =
+            std::min(digits.find_first_not_of('0'), digits.size() - 1);
+        field.second = digits.substr(first_kept);
+    }
+}
+
+// True when `e` has a field whose name equals `name`, ignoring letter case.
+static bool has_field(const BibEntry& e, const std::string& name) {
+    const std::string wanted = to_lower_copy(name);
+    return std::any_of(
+        e.fields.begin(), e.fields.end(),
+        [&wanted](const std::pair<const std::string, std::string>& field) {
+            return to_lower_copy(field.first) == wanted;
+        });
+}
+
+// True when at least one field of `e` has a lowercased name contained in
+// `names` (which is therefore expected to hold lowercase names).
+static bool has_any_field(const BibEntry& e, const std::set<std::string>& names) {
+    return std::any_of(
+        e.fields.begin(), e.fields.end(),
+        [&names](const std::pair<const std::string, std::string>& field) {
+            return names.find(to_lower_copy(field.first)) != names.end();
+        });
+}
+
+// Concatenate the elements of `v`, placing `sep` between neighbours.
+// An empty vector gives an empty string.
+static std::string join(const std::vector<std::string>& v, const std::string& sep) {
+    if (v.empty()) return std::string();
+
+    std::string joined = v.front();
+    for (auto it = v.begin() + 1; it != v.end(); ++it) {
+        joined += sep;
+        joined += *it;
+    }
+    return joined;
+}
+
+// Check that `e` carries the fields required for its entry type.
+//
+// Returns true when nothing is missing (entry types without a rule always
+// pass). Otherwise returns false and sets `error` to the missing field names
+// joined with ", "; `error` is not modified on success.
+static bool check_required_fields(const BibEntry& e, std::string& error) {
+    const std::string kind = to_lower_copy(e.type);
+    std::vector<std::string> missing;
+
+    // Pick the plain "all of these" list for the entry type, if it has one.
+    const std::vector<std::string>* required = nullptr;
+    if (kind == "article") {
+        /* TODO enable via a --strict flag
+        if (!has_any_field(e, REQUIRED_ARTICLE_JOURNAL_OR_HOWPUBLISHED))
+            missing.push_back("journal or howpublished");
+        */
+        required = &REQUIRED_ARTICLE_FIELDS;
+    } else if (kind == "book") {
+        // Reported ahead of the plain list so it comes first in the message.
+        if (!has_any_field(e, REQUIRED_BOOK_AUTHOR_OR_EDITOR))
+            missing.push_back("author or editor");
+        required = &REQUIRED_BOOK_FIELDS;
+    } else if (kind == "inproceedings") {
+        required = &REQUIRED_INPROC_FIELDS;
+    } else if (kind == "incollection") {
+        required = &REQUIRED_INCOLL_FIELDS;
+    } else if (kind == "phdthesis" || kind == "mastersthesis") {
+        required = &REQUIRED_THESIS_FIELDS;
+    } else if (kind == "techreport") {
+        required = &REQUIRED_TECHREPORT_FIELDS;
+    } else if (kind == "booklet") {
+        required = &REQUIRED_BOOKLET_FIELDS;
+    }
+
+    if (required != nullptr) {
+        for (const auto& field_name : *required) {
+            if (!has_field(e, field_name)) missing.push_back(field_name);
+        }
+    }
+
+    if (missing.empty()) return true;
+    error = join(missing, ", ");
+    return false;
+}
+
+// Build a canonical citation key for `e`.
+//
+//   surname + year + keywords   when an author surname is available
+//   keywords + year             when it is not
+//   the entry's own key         when the title yields no word or there is no year
+//
+// `keywords` is the first content word of the title followed by the second
+// one (if any) with its first letter uppercased, which reduces collisions
+// while keeping the key readable.
+static std::string generate_id(const BibEntry& e) {
+    const std::string title = get_field_value(e, "title");
+    std::string keywords = first_content_word_from_title(title);
+    std::string second = second_content_word_from_title(title);
+    if (!second.empty()) {
+        second[0] = static_cast<char>(
+            std::toupper(static_cast<unsigned char>(second[0])));
+        keywords += second;
+    }
+
+    const std::string year_digits = extract_year_digits(get_field_value(e, "year"));
+    if (keywords.empty() || year_digits.empty())
+        return e.key;
+
+    const std::string surname = lead_author_surname(get_field_value(e, "author"));
+    return surname.empty() ? keywords + year_digits
+                           : surname + year_digits + keywords;
+}
+
+// True when the value (outer braces or quotes stripped, compared in
+// lowercase) contains any of the listed domains as a substring.
+static bool contains_advertising_link(const std::string& raw) {
+    static const char* const BAD_DOMAINS[] = {
+        "books.google.",
+        "jstor.org",
+        "researchgate.net",
+        "openresearchlibrary.org",
+        "semanticscholar.org"
+    };
+
+    const std::string haystack = to_lower_copy(strip_outer_braces_or_quotes(raw));
+    for (const char* domain : BAD_DOMAINS) {
+        if (haystack.find(domain) != std::string::npos) return true;
+    }
+    return false;
+}
+
+// Decide whether a field is left out of the output: only `url` and `note`
+// fields (any letter case) are candidates, and only when their value contains
+// one of the domains known to contains_advertising_link().
+static bool should_suppress_field(const std::string& name, const std::string& value) {
+    const std::string key = to_lower_copy(name);
+    const bool may_hold_link = (key == "url" || key == "note");
+    return may_hold_link && contains_advertising_link(value);
+}
+
+// Print one field as "\tname=value" to std::cout, without a trailing newline.
+//
+//   * Values not enclosed in braces are printed on one line inside braces; a
+//     surrounding pair of double quotes is replaced by the braces.
+//   * Braced values of at most WRAP_COLUMNS characters are printed as they are.
+//   * Longer braced values are wrapped: the braces go on their own lines and
+//     the content is broken at spaces into lines of at most WRAP_COLUMNS
+//     characters (a stretch without any space is cut at WRAP_COLUMNS).
+//
+// Empty and one-character values are handled explicitly, so front()/back()
+// are never called on an empty string and a lone quote character is not
+// mistaken for a quoted pair.
+static void print_wrapped_field(const std::string& name, const std::string& value) {
+    const bool braced =
+        value.size() >= 2 && value.front() == '{' && value.back() == '}';
+    if (!braced) {
+        const bool quoted =
+            value.size() >= 2 && value.front() == '"' && value.back() == '"';
+        const std::string inner = quoted ? value.substr(1, value.size() - 2) : value;
+        std::cout << "\t" << name << "={" << inner << "}";
+        return;
+    }
+    if (value.size() <= WRAP_COLUMNS) {
+        std::cout << "\t" << name << "=" << value;
+        return;
+    }
+
+    const std::string inner = value.substr(1, value.size() - 2);
+    // Open brace on its own line after the field name.
+    std::cout << "\t" << name << "={\n";
+    std::size_t pos = 0;
+    const std::size_t n = inner.size();
+    while (pos < n) {
+        // Each output line starts at a non-space character.
+        while (pos < n && inner[pos] == ' ')
+            ++pos;
+        if (pos >= n) break;
+        std::size_t end = pos + WRAP_COLUMNS;
+        if (end >= n) {
+            end = n;
+        } else {
+            // Break at the last space within the window, if there is one.
+            const std::size_t space_pos = inner.rfind(' ', end);
+            if (space_pos != std::string::npos && space_pos > pos)
+                end = space_pos;
+        }
+        std::cout << "\t\t" << inner.substr(pos, end - pos) << "\n";
+        pos = end;
+    }
+    // Closing brace on its own line, indented once.
+    std::cout << "\t}";
+}
+%}
+
+// Code that must be visible wherever the generated header is included: the
+// semantic value types use std::string and std::map, and the lexer refers to
+// YYLTYPE.
+%code requires {
+    #include <map>
+    #include <string>
+
+    // Source span of a token or rule; filled in by the lexer.
+    #ifndef YYLTYPE_IS_DECLARED
+    #define YYLTYPE_IS_DECLARED 1
+    typedef struct YYLTYPE {
+        int first_line;
+        int first_column;
+        int last_line;
+        int last_column;
+    } YYLTYPE;
+    #endif
+}
+
+%locations
+
+// Semantic values are raw owning pointers; the grammar actions delete them
+// once their contents have been copied.
+%union {
+    std::string* str;
+    std::map<std::string, std::string>* fieldmap;
+}
+
+// NOTE(review): NUMBER is declared and accepted by `value`, but no lexer rule
+// in this project returns it; numeric values arrive as IDENT.
+%token <str> IDENT STRING NUMBER
+%token AT LBRACE RBRACE COMMA EQUALS HASH
+%type <fieldmap> fields field
+%type <str> value
+
+%%
+// Start symbol: a BibTeX file is a (possibly empty) sequence of entries.
+bibtex : entries { /* done */ }
+;
+
+// Left-recursive list of entries. The empty alternative lets an empty or
+// comment-only input parse successfully instead of ending in a syntax error.
+entries : /* empty */
+        | entries entry
+;
+
+// @type{key, fields} : copy the pieces into a BibEntry appended to the global
+// `entries` list, then release the heap-allocated semantic values.
+entry : AT IDENT LBRACE IDENT COMMA fields RBRACE {
+    entries.push_back(BibEntry{*$2, *$4, *$6});
+    delete $6;
+    delete $4;
+    delete $2;
+}
+;
+
+// Comma-separated list of fields, accumulated into the map created for the
+// first field. std::map::insert keeps existing keys, so when a name occurs
+// twice the first occurrence wins.
+fields : fields COMMA field { $$ = $1; $$->insert($3->begin(), $3->end()); delete $3; }
+       | field { $$ = $1; }
+       // Allow trailing comma
+       | fields COMMA { $$ = $1; }
+;
+
+// name = value : produce a one-element map that the `fields` rule merges.
+field : IDENT EQUALS value {
+    $$ = new std::map<std::string, std::string>();
+    // Store the name lowercased so that spellings differing only in letter
+    // case (e.g. Title and title) share one key instead of both surviving
+    // and both being printed as the same lowercase name.
+    std::string name = to_lower_copy(*$1);
+    std::string value = *$3;
+    if (name == "author")
+        value = normalize_author_value(value);
+    // A date made of just four digits is stored as a year.
+    if (name == "date" && is_year_only_date(value))
+        name = "year";
+    // Fold the journaltitle alias into journal.
+    if (name == "journaltitle")
+        name = "journal";
+    (*$$)[name] = value;
+    delete $1; delete $3;
+}
+;
+
+// A field value is a single token; ownership of its text passes upward
+// unchanged. Concatenation with HASH is not part of this grammar.
+value : STRING { $$ = $1; }
+      | NUMBER { $$ = $1; }
+      | IDENT  { $$ = $1; }
+;
+
+%%
+
+void yyerror(const char* s) {
+    extern YYLTYPE yylloc;
+    std::cerr << "parse error at " << yylloc.first_line << ":" << yylloc.first_column << ": " << s << "\n";
+}
+
+// Read BibTeX from standard input, normalize it, and print the canonical form
+// to standard output.
+//
+// Returns 0 on success and 1 when the input does not parse or when an entry
+// lacks a required field (only the first such entry is reported). Nothing is
+// printed to standard output in either failure case.
+int main() {
+    init_field_order();
+    // yyparse() returns non-zero after a syntax error (already reported via
+    // yyerror). Stop here so a broken input is not turned into partial
+    // output with a success exit status.
+    if (yyparse() != 0) {
+        return 1;
+    }
+    // Normalize date/month fields
+    for (auto& e : entries) {
+        split_year_month_from_date(e);
+        normalize_month_in_entry(e);
+    }
+    // Check required fields for each entry
+    for (const auto& e : entries) {
+        std::string error;
+        if (!check_required_fields(e, error)) {
+            std::cerr << "@" << e.type << "{" << e.key << "} requires fields: " << error << '\n';
+            return 1;
+        }
+    }
+    for (const auto& e : entries) {
+        const std::string type_l = to_lower_copy(e.type);
+        const std::string id = generate_id(e);
+        std::cout << "@" << type_l << "{" << id;
+        // Copy fields into a vector and sort by preferred order, then name.
+        std::vector<std::pair<std::string, std::string>> ordered_fields(
+            e.fields.begin(), e.fields.end());
+        std::sort(ordered_fields.begin(), ordered_fields.end(),
+                  [](const auto& a, const auto& b) {
+                      const int ra = field_rank_of(a.first);
+                      const int rb = field_rank_of(b.first);
+                      if (ra != rb) return ra < rb;
+                      return to_lower_copy(a.first) < to_lower_copy(b.first);
+                  });
+
+        for (const auto& f : ordered_fields) {
+            if (should_suppress_field(f.first, f.second)) continue;
+            const std::string field_l = to_lower_copy(f.first);
+            // The separator goes before each field, so the last field is not
+            // followed by a comma.
+            std::cout << ",\n";
+            print_wrapped_field(field_l, f.second);
+        }
+        std::cout << "\n}\n";
+    }
+    return 0;
+}
--- a/test/A_Theory_of_Justice.bibtex
+++ b/test/A_Theory_of_Justice.bibtex
@@ -0,0 +1,9 @@
+@book{rawls2020theory,
+  title={A {Theory} of Justice: Revised Edition},
+  % The author
+  author="R\\{a}wls, J. AND Seán Healy", % the rest
+  isbn={9780674257672},
+  url={https://books.google.ie/books?id=cngvEAAAQBAJ},
+  year={2020},
+  publisher={Harvard University Press}
+}
--- a/test/big_file.bib
+++ b/test/big_file.bib
--- a/test/sample.bib
+++ b/test/sample.bib
@@ -0,0 +1,13 @@
+@Book{x:aristotle1968poetics,
+	author={Aristotle},
+	title={Poetics},
+	date=1968,
+	editor={David W. Lucas},
+	series={Clarendon Aristotle},
+	publisher={Clarendon Press},
+	location={Oxford},
+	keywords={primary},
+	langid={english},
+	langidopts={variant=british},
+	shorttitle={Poetics}
+}