diff --git a/recover-pdfs.c b/recover-pdfs.c index ba530fa..d77156c 100644 --- a/recover-pdfs.c +++ b/recover-pdfs.c @@ -4,17 +4,31 @@ #include #define CHUNK 4096 +#define MAX_PDF_SIZE (100LL * 1024 * 1024) // 100 MB -int match_at(const unsigned char *buf, size_t len, const char *pat) { - size_t p = strlen(pat); - if (len < p) return 0; - return memcmp(buf, pat, p) == 0; +int valid_pdf_header(const unsigned char *buf, size_t len) { /* Returns 1 when buf begins with "%PDF-<digit>.<digit>", one optional extra digit, then LF or CRLF; returns 0 otherwise. Reads only buf[0..len-1]. */ + if (len < 9) return 0; /* shortest accepted header, e.g. "%PDF-1.4\n", is 9 bytes */ + + if (memcmp(buf, "%PDF-", 5) != 0) + return 0; + + if (buf[5] < '0' || buf[5] > '9') return 0; + if (buf[6] != '.') return 0; + if (buf[7] < '0' || buf[7] > '9') return 0; + + size_t i = 8; + + if (buf[i] >= '0' && buf[i] <= '9') i++; /* optional extra digit; buf[8] is in range because len >= 9 */ + + if (i + 1 < len && buf[i] == '\r' && buf[i + 1] == '\n') return 1; /* bounds-checked: i can be 9 while len is only 9 or 10 */ + if (i < len && buf[i] == '\n') return 1; + + return 0; } -size_t find_in_buffer(const unsigned char *buf, size_t len, const char *pat) { - size_t p = strlen(pat); - for (size_t i = 0; i + p <= len; i++) { - if (memcmp(buf + i, pat, p) == 0) +size_t find_eof(const unsigned char *buf, size_t len) { + for (size_t i = 0; i + 5 <= len; i++) { + if (memcmp(buf + i, "%%EOF", 5) == 0) return i; } return (size_t)-1; @@ -42,6 +56,9 @@ int main(int argc, char **argv) { mkdir(outdir, 0755); + if (strcmp(infile, "-") == 0) { + infile = "/dev/stdin"; + } FILE *f = fopen(infile, "rb"); if (!f) { perror("open input"); @@ -50,11 +67,12 @@ int main(int argc, char **argv) { unsigned char *buf = malloc(CHUNK * 2); size_t buf_len = 0; - int pdf_count = 0; - unsigned char *file_data = NULL; - size_t file_size = 0; + unsigned char *pdf_buf = NULL; + size_t pdf_size = 0; + int collecting = 0; + int pdf_count = 0; while (!feof(f)) { size_t r = fread(buf + buf_len, 1, CHUNK, f); @@ -65,28 +83,40 @@ int main(int argc, char **argv) { while (i < buf_len) { if (!collecting) { - if (i + 5 < buf_len && memcmp(buf + i, "%PDF-", 5) == 0) { + if (i + 10 < buf_len && valid_pdf_header(buf + i, buf_len - i)) { collecting = 1; - file_data = malloc(1024); - file_size = 0; 
+ pdf_buf = malloc(1024); + pdf_size = 0; } } if (collecting) { - if (file_size % 1024 == 0) { - file_data = realloc(file_data, file_size + 1024); + + // HARD SIZE LIMIT CHECK + if (pdf_size >= MAX_PDF_SIZE) { + free(pdf_buf); + pdf_buf = NULL; + pdf_size = 0; + collecting = 0; + continue; } - file_data[file_size++] = buf[i]; + if (pdf_size % 1024 == 0) { + pdf_buf = realloc(pdf_buf, pdf_size + 1024); + } - if (file_size > 6) { - if (memmem(file_data, file_size, "%%EOF", 5)) { - write_pdf(file_data, file_size, pdf_count++, outdir); - free(file_data); - file_data = NULL; - file_size = 0; - collecting = 0; - } + pdf_buf[pdf_size++] = buf[i]; + + size_t eof_pos = find_eof(pdf_buf, pdf_size); + if (eof_pos != (size_t)-1) { + size_t end = eof_pos + 5; + + write_pdf(pdf_buf, end, pdf_count++, outdir); + + free(pdf_buf); + pdf_buf = NULL; + pdf_size = 0; + collecting = 0; } }