Files
recover-pdfs/recover-pdfs.c

127 lines
3.0 KiB
C

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
// Number of bytes requested from the input per fread() call in main().
#define CHUNK (1024 * 1024) // 1 MiB
// Cap on the bytes written to one recovered file (100 MiB). When a file
// reaches this size main() closes it and goes back to scanning for headers.
// The LL suffix makes the constant a long long.
#define MAX_PDF_SIZE (100LL * 1024 * 1024)
// Carver state used by main() while walking the input byte by byte.
typedef enum {
    SCANNING = 0, // looking for a "%PDF-" header
    IN_PDF = 1    // copying bytes into the currently open output file
} State;
// Check whether the bytes at p form a PDF header line.
//
// Accepted shape: "%PDF-" <digit> "." <digit> [<digit>] followed by "\n" or
// "\r\n" (for example "%PDF-1.4\n" or "%PDF-1.10\r\n").
//
// p   - candidate bytes; need not be NUL-terminated
// len - number of readable bytes at p
//
// Returns 1 when a complete header is present within the first len bytes,
// 0 otherwise. Never reads p[len] or beyond.
static int valid_pdf_header(const unsigned char *p, size_t len) {
    // Shortest accepted header, "%PDF-d.d\n", is 9 bytes; this also makes
    // the fixed-offset reads of p[0]..p[8] below safe.
    if (len < 9) return 0;

    if (p[0] != '%' || p[1] != 'P' || p[2] != 'D' || p[3] != 'F' || p[4] != '-')
        return 0;
    if (p[5] < '0' || p[5] > '9') return 0;
    if (p[6] != '.') return 0;
    if (p[7] < '0' || p[7] > '9') return 0;

    // Optional second digit of the minor version.
    size_t i = 8;
    if (p[i] >= '0' && p[i] <= '9') i++;

    // The terminator must lie inside the buffer; the original code could
    // read p[10] when only 10 bytes were available.
    if (i >= len) return 0;
    if (p[i] == '\n') return 1;
    return p[i] == '\r' && i + 1 < len && p[i + 1] == '\n';
}
// Format the name of the idx-th recovered file, "<dir>/pdf_NNNN.pdf", into
// path. idx is zero-padded to at least four digits. At most sz bytes are
// written (including the terminating NUL); a longer name is truncated.
static void make_outfile(char *path, size_t sz, const char *dir, int idx)
{
    (void)snprintf(path, sz, "%s/pdf_%04d.pdf", dir, idx);
}
// Close the current output file and clear the handle.
// Returns 0 on success, 1 if the final flush/close failed (already reported).
static int close_output(FILE **out) {
    int failed = (fclose(*out) != 0);
    *out = NULL;
    if (failed) perror("close output");
    return failed;
}

// Carve PDF documents out of a raw image.
//
// Usage: <prog> <image_file> <output_dir>   ("-" reads /dev/stdin)
//
// The input is streamed in CHUNK-sized reads. In SCANNING state every '%' is
// tested with valid_pdf_header(); on a match a new "<output_dir>/pdf_NNNN.pdf"
// is opened and bytes (starting with that '%') are copied until the first
// "%%EOF" marker or until MAX_PDF_SIZE bytes have been written.
//
// NOTE(review): stopping at the first "%%EOF" truncates PDFs that contain
// more than one marker (e.g. incrementally updated files) - confirm this is
// the intended heuristic.
//
// Returns 0 on success, 1 on usage, setup, read or write errors.
int main(int argc, char **argv) {
    // A header candidate is only tested when this many bytes are available
    // ("%PDF-d.dd\r\n"), which is the most valid_pdf_header() may inspect.
    const size_t header_span = 11;
    // Output path capacity, and the room kept for "/pdf_<int>.pdf" + NUL.
    enum { OUT_PATH_CAP = 512, OUT_NAME_RESERVE = 32 };

    if (argc != 3) {
        fprintf(stderr, "Usage: %s <image_file> <output_dir>\n", argv[0]);
        return 1;
    }

    const char *infile = argv[1];
    if (infile[0] == '-' && infile[1] == '\0') {
        infile = "/dev/stdin";
    }

    const char *outdir = argv[2];
    if (strlen(outdir) > (size_t)(OUT_PATH_CAP - OUT_NAME_RESERVE)) {
        // Otherwise make_outfile() would silently truncate every file name.
        fprintf(stderr, "output directory path too long: %s\n", outdir);
        return 1;
    }

    // Create the directory; an already existing directory is fine.
    struct stat st;
    if (mkdir(outdir, 0755) != 0 &&
        (stat(outdir, &st) != 0 || !S_ISDIR(st.st_mode))) {
        fprintf(stderr, "cannot create output directory: %s\n", outdir);
        return 1;
    }

    FILE *in = fopen(infile, "rb");
    if (!in) {
        perror("open input");
        return 1;
    }

    // One heap buffer: up to header_span - 1 carried bytes plus one chunk.
    unsigned char *buf = malloc((size_t)CHUNK + header_span);
    if (!buf) {
        perror("malloc");
        fclose(in);
        return 1;
    }

    int rc = 0;
    State state = SCANNING;
    FILE *out = NULL;
    unsigned char eof_win[5] = {0}; // rolling window for "%%EOF"
    size_t pdf_size = 0;
    int pdf_count = 0;
    size_t carry = 0; // unprocessed tail of the previous chunk, now at buf[0..carry)

    for (;;) {
        size_t r = fread(buf + carry, 1, CHUNK, in);
        if (r == 0) break; // EOF or error; a carried tail is too short to be a header

        size_t len = carry + r;
        size_t i = 0;

        while (i < len) {
            if (state == SCANNING) {
                if (buf[i] != '%') {
                    i++;
                    continue;
                }
                if (len - i < header_span) {
                    // Candidate straddles the chunk boundary: keep it
                    // unprocessed and retry once more data has been read.
                    break;
                }
                if (!valid_pdf_header(&buf[i], len - i)) {
                    i++;
                    continue;
                }

                char path[OUT_PATH_CAP];
                make_outfile(path, sizeof(path), outdir, pdf_count++);
                out = fopen(path, "wb");
                if (!out) {
                    // Best effort: report and keep scanning for later PDFs.
                    perror(path);
                    i++;
                    continue;
                }
                state = IN_PDF;
                pdf_size = 0;
                memset(eof_win, 0, sizeof(eof_win));
                // Do not advance i: the '%' itself is the first byte of the
                // file and is written by the IN_PDF branch below.
                continue;
            }

            // state == IN_PDF
            if ((unsigned long long)pdf_size >= (unsigned long long)MAX_PDF_SIZE) {
                // Give up on this file; rescan the current byte as a
                // potential start of the next header.
                rc |= close_output(&out);
                state = SCANNING;
                continue;
            }

            if (fputc(buf[i], out) == EOF) {
                perror("write output");
                rc = 1;
                goto done;
            }
            pdf_size++;

            // Rolling end-of-file detector; survives chunk boundaries
            // because the window lives outside the read loop.
            eof_win[0] = eof_win[1];
            eof_win[1] = eof_win[2];
            eof_win[2] = eof_win[3];
            eof_win[3] = eof_win[4];
            eof_win[4] = buf[i];
            if (eof_win[0] == '%' && eof_win[1] == '%' &&
                eof_win[2] == 'E' && eof_win[3] == 'O' && eof_win[4] == 'F') {
                rc |= close_output(&out);
                state = SCANNING;
            }
            i++;
        }

        // Move only the bytes that were NOT processed to the front, so no
        // byte is ever handled (or written) twice.
        carry = len - i;
        if (carry > 0) {
            memmove(buf, buf + i, carry);
        }
    }

    if (ferror(in)) {
        perror("read input");
        rc = 1;
    }

done:
    if (out) {
        rc |= close_output(&out);
    }
    free(buf);
    fclose(in);
    return rc;
}