/*
 * PDF carver: scans a byte stream for embedded PDF documents and writes each
 * one it finds to <outdir>/pdf_NNNN.pdf.
 *
 * A document starts at a "%PDF-d.d" or "%PDF-d.dd" header that is followed
 * directly by LF or CRLF. It ends at the first "%%EOF" marker after the
 * header, or once MAX_PDF_SIZE bytes have been written, whichever comes first.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

#define CHUNK (1024 * 1024)                /* bytes requested per fread() */
#define MAX_PDF_SIZE (100LL * 1024 * 1024) /* cap on one carved file */
#define HEADER_MAX 11                      /* longest header: "%PDF-1.10\r\n" */

typedef enum { SCANNING, IN_PDF } State;

/* Returns nonzero when c is an ASCII decimal digit. */
static int is_ascii_digit(unsigned char c)
{
    return c >= '0' && c <= '9';
}

/*
 * Checks whether the bytes at p begin with a PDF header line.
 *
 * p   - candidate bytes; only p[0 .. len-1] are ever read
 * len - number of readable bytes at p
 *
 * Accepts "%PDF-" + digit + "." + one or two digits, terminated by "\n" or
 * "\r\n". Returns 1 on a match, 0 otherwise (including when len is too short
 * to hold the complete header and its line ending).
 */
static int valid_pdf_header(const unsigned char *p, size_t len)
{
    if (len < 9) { /* shortest possible header: "%PDF-1.4\n" */
        return 0;
    }
    if (memcmp(p, "%PDF-", 5) != 0) {
        return 0;
    }
    if (!is_ascii_digit(p[5]) || p[6] != '.' || !is_ascii_digit(p[7])) {
        return 0;
    }

    size_t i = 8;
    if (is_ascii_digit(p[i])) { /* optional second minor-version digit */
        i++;
    }
    if (i >= len) {
        return 0;
    }
    if (p[i] == '\n') {
        return 1;
    }
    /* Bounds-check before peeking at the byte after '\r'. */
    return p[i] == '\r' && i + 1 < len && p[i + 1] == '\n';
}

/*
 * Formats the output path "<dir>/pdf_<idx, 4 digits>.pdf" into path.
 *
 * Returns 0 on success, -1 if formatting failed or the result did not fit in
 * sz bytes (path is then truncated and must not be used).
 */
static int make_outfile(char *path, size_t sz, const char *dir, int idx)
{
    int n = snprintf(path, sz, "%s/pdf_%04d.pdf", dir, idx);
    return (n < 0 || (size_t)n >= sz) ? -1 : 0;
}

/* Closes a carved output file; reports and returns -1 on a flush error. */
static int close_output(FILE *out)
{
    if (fclose(out) != 0) {
        perror("close output");
        return -1;
    }
    return 0;
}

/*
 * Usage: prog <input-file|-> <output-dir>
 *
 * Reads the input ("-" means /dev/stdin) in CHUNK-sized pieces and carves
 * every PDF found into the output directory, which is created if missing.
 * Returns 0 on success, 1 on a usage error or any I/O failure.
 */
int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <input-file|-> <output-dir>\n", argv[0]);
        return 1;
    }

    const char *infile = argv[1];
    if (strcmp(infile, "-") == 0) {
        infile = "/dev/stdin";
    }
    const char *outdir = argv[2];

    /* An already-existing directory is fine; anything else is fatal. */
    if (mkdir(outdir, 0755) != 0 && errno != EEXIST) {
        perror("create output directory");
        return 1;
    }

    FILE *in = fopen(infile, "rb");
    if (!in) {
        perror("open input");
        return 1;
    }

    /*
     * One heap buffer holds the unprocessed tail of the previous chunk
     * (always fewer than HEADER_MAX bytes) followed by the fresh chunk.
     */
    unsigned char *data = malloc((size_t)CHUNK + HEADER_MAX);
    if (!data) {
        perror("allocate buffer");
        fclose(in);
        return 1;
    }

    int rc = 0;
    State state = SCANNING;
    FILE *out = NULL;
    unsigned char eof_win[5] = {0}; // last five bytes written, for "%%EOF"
    long long pdf_size = 0;
    int pdf_count = 0;
    size_t carry = 0; // unprocessed bytes kept at the front of data

    for (;;) {
        size_t r = fread(data + carry, 1, CHUNK, in);
        int at_end = (r < (size_t)CHUNK); // short read: EOF or read error
        size_t len = carry + r;
        size_t i = 0;

        for (; i < len; i++) {
            if (state == SCANNING) {
                // Not enough lookahead for a full header: defer these bytes
                // to the next chunk instead of re-processing them later.
                if (!at_end && len - i < HEADER_MAX) {
                    break;
                }
                if (data[i] != '%' || !valid_pdf_header(&data[i], len - i)) {
                    continue;
                }

                char path[512];
                if (make_outfile(path, sizeof path, outdir, pdf_count) != 0) {
                    fprintf(stderr, "output path too long\n");
                    rc = 1;
                    goto done;
                }
                out = fopen(path, "wb");
                if (!out) {
                    perror(path);
                    rc = 1;
                    continue;
                }
                pdf_count++; // consume the index only once the file exists
                state = IN_PDF;
                pdf_size = 0;
                memset(eof_win, 0, sizeof eof_win);
                // Fall through so the leading '%' is written as well.
            }

            if (fputc(data[i], out) == EOF) {
                perror("write output");
                fclose(out);
                out = NULL;
                state = SCANNING;
                rc = 1;
                continue;
            }
            pdf_size++;

            // Slide the window and test for the end-of-file marker.
            memmove(eof_win, eof_win + 1, sizeof eof_win - 1);
            eof_win[sizeof eof_win - 1] = data[i];
            if (memcmp(eof_win, "%%EOF", sizeof eof_win) == 0 ||
                pdf_size >= MAX_PDF_SIZE) {
                if (close_output(out) != 0) {
                    rc = 1;
                }
                out = NULL;
                state = SCANNING;
            }
        }

        // Keep whatever was deferred; it is examined with the next chunk.
        carry = len - i;
        memmove(data, data + i, carry);

        if (at_end) {
            break;
        }
    }

    if (ferror(in)) {
        perror("read input");
        rc = 1;
    }

done:
    // A PDF still open here never reached "%%EOF"; keep what was carved.
    if (out && close_output(out) != 0) {
        rc = 1;
    }
    free(data);
    fclose(in);
    return rc;
}