/*
 * Carve embedded PDF documents out of an arbitrary byte stream.
 *
 * The input is scanned for a PDF header ("%PDF-d.d" or "%PDF-d.dd" followed
 * by LF or CRLF). From the header on, bytes are collected until the first
 * "%%EOF" marker, and the collected span is written to
 * <outdir>/pdf_NNNN.pdf. Candidates that grow past MAX_PDF_SIZE without
 * reaching "%%EOF" are discarded.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

#define CHUNK 4096
#define MAX_PDF_SIZE (100LL * 1024 * 1024) // 100 MB

enum {
    PDF_HEADER_MAX = 11,    /* longest accepted header: "%PDF-d.dd\r\n" */
    EOF_MARKER_LEN = 5,     /* strlen("%%EOF") */
    PDF_INITIAL_CAP = 1024  /* first allocation for a candidate PDF */
};

/*
 * Return 1 if buf starts with a PDF header line, 0 otherwise.
 *
 * Accepted form: "%PDF-" digit '.' digit [digit] followed by "\n" or "\r\n".
 * Never reads past buf[len - 1]; a header whose line terminator does not fit
 * in len bytes is rejected.
 */
int valid_pdf_header(const unsigned char *buf, size_t len) {
    if (len < 9) {
        return 0;
    }
    if (memcmp(buf, "%PDF-", 5) != 0) {
        return 0;
    }
    if (buf[5] < '0' || buf[5] > '9') {
        return 0;
    }
    if (buf[6] != '.') {
        return 0;
    }
    if (buf[7] < '0' || buf[7] > '9') {
        return 0;
    }

    size_t i = 8;
    if (buf[i] >= '0' && buf[i] <= '9') {
        i++; /* optional second minor-version digit */
    }

    /* i may now equal len, so every further access is bounds-checked. */
    if (i < len && buf[i] == '\n') {
        return 1;
    }
    if (i + 1 < len && buf[i] == '\r' && buf[i + 1] == '\n') {
        return 1;
    }
    return 0;
}

/*
 * Return the offset of the first "%%EOF" marker in buf[0..len), or
 * (size_t)-1 if the marker does not occur.
 */
size_t find_eof(const unsigned char *buf, size_t len) {
    for (size_t i = 0; i + EOF_MARKER_LEN <= len; i++) {
        if (memcmp(buf + i, "%%EOF", EOF_MARKER_LEN) == 0) {
            return i;
        }
    }
    return (size_t)-1;
}

/*
 * Write len bytes of data to "<outdir>/pdf_<idx>.pdf" (idx zero-padded to
 * four digits). Failures are reported on stderr; nothing is returned.
 */
void write_pdf(const unsigned char *data, size_t len, int idx, const char *outdir) {
    char path[512];
    int n = snprintf(path, sizeof(path), "%s/pdf_%04d.pdf", outdir, idx);
    if (n < 0 || (size_t)n >= sizeof(path)) {
        /* A truncated path would silently write to the wrong file. */
        fprintf(stderr, "output path too long for pdf %d\n", idx);
        return;
    }

    FILE *f = fopen(path, "wb");
    if (!f) {
        perror(path);
        return;
    }
    if (fwrite(data, 1, len, f) != len) {
        perror(path);
    }
    if (fclose(f) != 0) {
        /* fclose flushes buffered data, so it can fail on a full disk. */
        perror(path);
    }
}

/*
 * Usage: prog <input|-> <outdir>
 *
 * Reads the input file ("-" means standard input) and writes every carved
 * PDF into outdir. Returns 0 on success, 1 on usage error, if the input
 * cannot be opened or read, or on allocation failure.
 */
int main(int argc, char **argv) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <input|-> <outdir>\n", argv[0]);
        return 1;
    }

    const char *infile = argv[1];
    const char *outdir = argv[2];

    /* An already existing directory is fine; other failures are only warned
     * about, because write_pdf reports each file it cannot create. */
    if (mkdir(outdir, 0755) != 0 && errno != EEXIST) {
        perror("create output directory");
    }

    if (strcmp(infile, "-") == 0) {
        infile = "/dev/stdin";
    }

    FILE *f = fopen(infile, "rb");
    if (!f) {
        perror("open input");
        return 1;
    }

    /* Twice CHUNK so a short unscanned tail can be carried in front of the
     * next chunk without overflowing the buffer. */
    unsigned char *buf = malloc(CHUNK * 2);
    if (!buf) {
        perror("malloc");
        fclose(f);
        return 1;
    }

    size_t buf_len = 0;            /* valid bytes in buf (carry + new data) */
    unsigned char *pdf_buf = NULL; /* candidate PDF being collected */
    size_t pdf_size = 0;           /* bytes used in pdf_buf */
    size_t pdf_cap = 0;            /* bytes allocated for pdf_buf */
    int collecting = 0;
    int pdf_count = 0;
    int rc = 0;

    for (;;) {
        size_t r = fread(buf + buf_len, 1, CHUNK, f);
        /* A short read from a stdio stream means end of file or an error;
         * either way there is no more input to wait for. */
        int at_end = (r < CHUNK);
        buf_len += r;

        size_t i = 0;
        while (i < buf_len) {
            if (!collecting) {
                /* A header needs up to PDF_HEADER_MAX bytes of lookahead.
                 * With fewer left, stop here: mid-stream the tail is carried
                 * into the next chunk; at end of input it is too short to
                 * hold a header plus "%%EOF" anyway. */
                if (buf_len - i < PDF_HEADER_MAX) {
                    break;
                }
                if (!valid_pdf_header(buf + i, buf_len - i)) {
                    i++;
                    continue;
                }
                collecting = 1;
                pdf_size = 0;
            }

            /* Hard size limit: drop the candidate and rescan from this byte. */
            if (pdf_size >= (size_t)MAX_PDF_SIZE) {
                free(pdf_buf);
                pdf_buf = NULL;
                pdf_size = 0;
                pdf_cap = 0;
                collecting = 0;
                continue;
            }

            if (pdf_size == pdf_cap) {
                size_t new_cap = pdf_cap ? pdf_cap * 2 : (size_t)PDF_INITIAL_CAP;
                unsigned char *grown = realloc(pdf_buf, new_cap);
                if (!grown) {
                    perror("realloc");
                    rc = 1;
                    goto done;
                }
                pdf_buf = grown;
                pdf_cap = new_cap;
            }
            pdf_buf[pdf_size++] = buf[i++];

            /* The candidate is reset on the first marker, so a marker can
             * only ever end at the byte just appended: checking the tail is
             * equivalent to rescanning the whole candidate. */
            if (pdf_size >= EOF_MARKER_LEN &&
                find_eof(pdf_buf + pdf_size - EOF_MARKER_LEN, EOF_MARKER_LEN) == 0) {
                write_pdf(pdf_buf, pdf_size, pdf_count++, outdir);
                free(pdf_buf);
                pdf_buf = NULL;
                pdf_size = 0;
                pdf_cap = 0;
                collecting = 0;
            }
        }

        if (at_end) {
            break;
        }

        /* Keep the unscanned tail (fewer than PDF_HEADER_MAX bytes) so a
         * header straddling two chunks is still recognised. */
        size_t rest = buf_len - i;
        memmove(buf, buf + i, rest);
        buf_len = rest;
    }

    if (ferror(f)) {
        perror("read input");
        rc = 1;
    }

done:
    free(pdf_buf); /* unfinished candidate, if any */
    free(buf);
    fclose(f);
    return rc;
}