Add README, LICENSE, Makefile + use tabs
This commit is contained in:
225
recover-pdfs.c
225
recover-pdfs.c
@@ -3,131 +3,124 @@
|
||||
#include <string.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
// Number of bytes requested from the input per read.
#define CHUNK (1024 * 1024) // 1 MB buffer
// Upper bound on the size of a single recovered PDF; long long so the
// product cannot overflow a 32-bit int.
#define MAX_PDF_SIZE (100LL * 1024 * 1024) // 100 MB
|
||||
|
||||
// Scanner state: looking for a header, or copying the bytes of a PDF.
typedef enum {
	SCANNING,
	IN_PDF
} State;

/*
 * Test whether the bytes at p start with a PDF header line.
 *
 * Accepted form: "%PDF-" <digit> "." <digit> [<digit>] followed by either
 * "\n" or "\r\n".
 *
 * p   - candidate bytes
 * len - number of readable bytes at p; fewer than 10 is always rejected
 *
 * Returns 1 on a match, 0 otherwise. Never reads at or beyond p[len].
 */
static int valid_pdf_header(const unsigned char *p, size_t len) {
	if (len < 10) return 0;
	if (memcmp(p, "%PDF-", 5) != 0) return 0;
	if (p[5] < '0' || p[5] > '9') return 0;
	if (p[6] != '.') return 0;
	if (p[7] < '0' || p[7] > '9') return 0;

	// Optional second digit of the minor version; i is at most 9 here,
	// so p[i] is always within the len >= 10 guarantee.
	size_t i = 8;
	if (p[i] >= '0' && p[i] <= '9') i++;

	if (p[i] == '\n') return 1;

	// The "\r\n" form needs one more byte; make sure it is really there
	// before looking at it (i + 1 can be 10 when len is exactly 10).
	return p[i] == '\r' && i + 1 < len && p[i + 1] == '\n';
}
|
||||
|
||||
/*
 * Locate the first "%%EOF" marker in buf[0..len).
 *
 * Returns the offset of the marker's first byte, or (size_t)-1 when the
 * marker does not occur (including when len is smaller than the marker).
 */
size_t find_eof(const unsigned char *buf, size_t len) {
	static const char marker[] = "%%EOF";
	const size_t marker_len = sizeof(marker) - 1;

	// i + marker_len <= len keeps every comparison inside the buffer.
	for (size_t i = 0; i + marker_len <= len; i++) {
		if (memcmp(buf + i, marker, marker_len) == 0) {
			return i;
		}
	}
	return (size_t)-1;
}
|
||||
|
||||
/*
 * Write len bytes of data to "<outdir>/pdf_<idx>.pdf" (idx zero-padded to
 * at least four digits), creating or truncating the file.
 *
 * Failures are reported on stderr; the function has no return value, so
 * the caller is not told whether the file was written completely.
 */
void write_pdf(const unsigned char *data, size_t len, int idx, const char *outdir) {
	char path[512];
	int n = snprintf(path, sizeof(path), "%s/pdf_%04d.pdf", outdir, idx);

	// A truncated path would name the wrong file; refuse to write it.
	if (n < 0 || (size_t)n >= sizeof(path)) {
		fprintf(stderr, "output path too long for pdf %d\n", idx);
		return;
	}

	FILE *f = fopen(path, "wb");
	if (!f) {
		perror(path);
		return;
	}

	if (fwrite(data, 1, len, f) != len) {
		perror(path);
	}
	// fclose flushes buffered data, so a failure here also means data loss.
	if (fclose(f) != 0) {
		perror(path);
	}
}
|
||||
/*
 * Format the output file name for recovered PDF number idx into path.
 *
 * path - destination buffer
 * sz   - size of the destination buffer in bytes
 * dir  - output directory, used verbatim as the prefix
 * idx  - sequence number, zero-padded to at least four digits
 *
 * The result is "<dir>/pdf_<idx>.pdf". If it does not fit in sz bytes it
 * is silently truncated (still NUL-terminated when sz > 0).
 */
static void make_outfile(char *path, size_t sz, const char *dir, int idx) {
	snprintf(path, sz, "%s/pdf_%04d.pdf", dir, idx);
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc != 3) {
|
||||
fprintf(stderr, "Usage: %s <image_file> <output_dir>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const char *infile = argv[1];
|
||||
const char *outdir = argv[2];
|
||||
|
||||
mkdir(outdir, 0755);
|
||||
|
||||
if (strcmp(infile, "-") == 0) {
|
||||
if (argc != 3) {
|
||||
fprintf(stderr, "Usage: %s <image_file> <output_dir>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
char *infile = argv[1];
|
||||
if (infile[0] == '-' && infile[1] == '\0') {
|
||||
infile = "/dev/stdin";
|
||||
}
|
||||
FILE *f = fopen(infile, "rb");
|
||||
if (!f) {
|
||||
perror("open input");
|
||||
return 1;
|
||||
}
|
||||
const char *outdir = argv[2];
|
||||
mkdir(outdir, 0755);
|
||||
FILE *in = fopen(infile, "rb");
|
||||
if (!in) {
|
||||
perror("open input");
|
||||
return 1;
|
||||
}
|
||||
unsigned char *buf = malloc(CHUNK);
|
||||
if (!buf) return 1;
|
||||
State state = SCANNING;
|
||||
FILE *out = NULL;
|
||||
unsigned char eof_win[5] = {0}; // rolling "%%EOF"
|
||||
size_t pdf_size = 0;
|
||||
int pdf_count = 0;
|
||||
// overlap buffer to catch split headers across chunks
|
||||
unsigned char overlap[16];
|
||||
size_t overlap_len = 0;
|
||||
while (1) {
|
||||
size_t r = fread(buf, 1, CHUNK, in);
|
||||
if (r == 0) break;
|
||||
size_t i = 0;
|
||||
// prepend overlap
|
||||
unsigned char *data = buf;
|
||||
size_t len = r;
|
||||
unsigned char temp[CHUNK + 16];
|
||||
if (overlap_len > 0) {
|
||||
memcpy(temp, overlap, overlap_len);
|
||||
memcpy(temp + overlap_len, buf, r);
|
||||
data = temp;
|
||||
len = r + overlap_len;
|
||||
i = 0;
|
||||
}
|
||||
overlap_len = 0;
|
||||
for (; i < len; i++) {
|
||||
if (state == SCANNING) {
|
||||
// only attempt match on '%'
|
||||
if (data[i] == '%' && i + 10 < len) {
|
||||
if (valid_pdf_header(&data[i], len - i)) {
|
||||
char path[512];
|
||||
make_outfile(path, sizeof(path), outdir, pdf_count++);
|
||||
out = fopen(path, "wb");
|
||||
if (!out) {
|
||||
state = SCANNING;
|
||||
continue;
|
||||
}
|
||||
state = IN_PDF;
|
||||
pdf_size = 0;
|
||||
|
||||
unsigned char *buf = malloc(CHUNK * 2);
|
||||
size_t buf_len = 0;
|
||||
memset(eof_win, 0, sizeof(eof_win));
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (state == IN_PDF) {
|
||||
if (pdf_size >= MAX_PDF_SIZE) {
|
||||
fclose(out);
|
||||
out = NULL;
|
||||
state = SCANNING;
|
||||
continue;
|
||||
}
|
||||
fputc(data[i], out);
|
||||
pdf_size++;
|
||||
// rolling EOF detector
|
||||
eof_win[0] = eof_win[1];
|
||||
eof_win[1] = eof_win[2];
|
||||
eof_win[2] = eof_win[3];
|
||||
eof_win[3] = eof_win[4];
|
||||
eof_win[4] = data[i];
|
||||
if (eof_win[0] == '%' &&
|
||||
eof_win[1] == '%' &&
|
||||
eof_win[2] == 'E' &&
|
||||
eof_win[3] == 'O' &&
|
||||
eof_win[4] == 'F') {
|
||||
fclose(out);
|
||||
out = NULL;
|
||||
state = SCANNING;
|
||||
}
|
||||
}
|
||||
}
|
||||
// save last 16 bytes for boundary-safe matching
|
||||
overlap_len = (len < 16) ? len : 16;
|
||||
memcpy(overlap, data + (len - overlap_len), overlap_len);
|
||||
}
|
||||
if (out) fclose(out);
|
||||
free(buf);
|
||||
fclose(in);
|
||||
|
||||
unsigned char *pdf_buf = NULL;
|
||||
size_t pdf_size = 0;
|
||||
|
||||
int collecting = 0;
|
||||
int pdf_count = 0;
|
||||
|
||||
while (!feof(f)) {
|
||||
size_t r = fread(buf + buf_len, 1, CHUNK, f);
|
||||
buf_len += r;
|
||||
|
||||
size_t i = 0;
|
||||
|
||||
while (i < buf_len) {
|
||||
|
||||
if (!collecting) {
|
||||
if (i + 10 < buf_len && valid_pdf_header(buf + i, buf_len - i)) {
|
||||
collecting = 1;
|
||||
pdf_buf = malloc(1024);
|
||||
pdf_size = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (collecting) {
|
||||
|
||||
// HARD SIZE LIMIT CHECK
|
||||
if (pdf_size >= MAX_PDF_SIZE) {
|
||||
free(pdf_buf);
|
||||
pdf_buf = NULL;
|
||||
pdf_size = 0;
|
||||
collecting = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (pdf_size % 1024 == 0) {
|
||||
pdf_buf = realloc(pdf_buf, pdf_size + 1024);
|
||||
}
|
||||
|
||||
pdf_buf[pdf_size++] = buf[i];
|
||||
|
||||
size_t eof_pos = find_eof(pdf_buf, pdf_size);
|
||||
if (eof_pos != (size_t)-1) {
|
||||
size_t end = eof_pos + 5;
|
||||
|
||||
write_pdf(pdf_buf, end, pdf_count++, outdir);
|
||||
|
||||
free(pdf_buf);
|
||||
pdf_buf = NULL;
|
||||
pdf_size = 0;
|
||||
collecting = 0;
|
||||
}
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
buf_len = 0;
|
||||
}
|
||||
|
||||
free(buf);
|
||||
fclose(f);
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user