#!/usr/bin/env python3

# pdfspotthediff-20251230
# djb
# SPDX-License-Identifier: LicenseRef-PD-hp OR CC0-1.0 OR 0BSD OR MIT-0 OR MIT

# dependencies: apt install python3-fitz (new name: python3-pymupdf)

# usage: pdfspotthediff old.pdf new.pdf
# output: old-spotthediff.pdf and new-spotthediff.pdf
# (overwriting those if they exist already)

# some caveats:
# * does not mark changes in positioning, color, font, glyphs, images
# * for highlighting, does not avoid colors already used in document

# comparison to (some) alternatives:
# * diff-pdf-wx: visual differences; each page separately; comparison window
# * diffpdf: word differences; each page separately; comparison window
# * https://github.com/malavika-suresh/multiple_pdf_comparison: word differences; each page separately; comparison PDF
# * https://github.com/cascremers/pdfdiff: sentence differences; crosses pages; vimdiff comparison
# * https://github.com/ssibb/PDF-Diff-Viewer: word differences; crosses pages; comparison window
# * this tool: pdfspotthediff: word differences; crosses pages; two comparison PDFs

import difflib
import sys

try:
    import pymupdf
except ImportError:
    try:
        import fitz as pymupdf  # older name of the same package
    except ImportError:
        pymupdf = None  # main() reports this with an install hint

# Highlight color (RGB, components in 0..1) for each non-'equal' difflib opcode.
tag2color = {
    'delete': (1.0, 0.8, 0.8),   # light red in old-spotthediff.pdf for removed text
    'insert': (0.8, 1.0, 1.0),   # light blue in new-spotthediff.pdf for added text
    'replace': (1.0, 1.0, 0.2),  # light yellow in both for modified text
}
notecolor = 0.8, 1.0, 0.8  # light green for words before+after blue/red in other PDF


def extract_words(pdf):
    """Return the text of every word in pdf as one flat list, in page order.

    The list index of a word is the "position" used as the key of the
    pos2color dictionaries; highlight() walks the words with the same
    get_text() call so that the positions line up.
    """
    return [w[4] for page in pdf for w in page.get_text('words', sort=False)]


def _mark(pos2color, start, stop, color):
    """Record the highlighting for one side of one difflib opcode.

    A non-empty range [start, stop) gets color on each of its positions.
    An empty range means the other document has text here that this one
    lacks, so the gap itself is marked instead: keys start-0.5 and
    start+0.5 stand for "the word before the gap" and "the word after the
    gap" (see highlight(), which also probes pos+0.5 for each word).
    """
    if start == stop:
        pos2color[start - 0.5] = notecolor
        pos2color[start + 0.5] = notecolor
        return
    for pos in range(start, stop):
        pos2color[pos] = color


def compute_highlights(words1, words2):
    """Diff two word lists and return (pos2color1, pos2color2).

    Each result maps a word position in the corresponding list (or a
    half-integer gap marker, see _mark()) to an RGB tuple.  Words inside
    'equal' opcodes get no entry.
    """
    pos2color1 = {}
    pos2color2 = {}
    # autojunk=False: do not let difflib treat frequent words as junk.
    matcher = difflib.SequenceMatcher(None, words1, words2, autojunk=False)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            continue
        _mark(pos2color1, i1, i2, tag2color[tag])
        _mark(pos2color2, j1, j2, tag2color[tag])
    return pos2color1, pos2color2


def highlight(pdf, pos2color):
    """Add highlight annotations to pdf for the positions in pos2color.

    Words are visited in the same order as in extract_words().  For the
    word at position pos, key pos carries its own color and key pos+0.5
    carries the gap-neighbor color; each key that is present produces one
    highlight annotation over the word's rectangle (w[:4]).
    """
    pos = 0
    for page in pdf:
        for w in page.get_text('words', sort=False):
            for offset in 0, 0.5:
                color = pos2color.get(pos + offset)
                if color is not None:
                    annotation = page.add_highlight_annot(pymupdf.Rect(w[:4]))
                    annotation.set_colors(stroke=color)
                    annotation.update()
            pos += 1


def output_name(fn):
    """Return the output path for input path fn.

    A trailing '.pdf' is dropped before '-spotthediff.pdf' is appended.
    """
    if fn.endswith('.pdf'):
        fn = fn[:-4]
    return fn + '-spotthediff.pdf'


def main(argv=None):
    """Run the tool: argv[1] is the old PDF, argv[2] is the new PDF.

    Writes the two annotated copies next to the inputs, overwriting any
    existing files of the same names.  Exits with a message on stderr if
    arguments or the PyMuPDF dependency are missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit('usage: pdfspotthediff old.pdf new.pdf')
    if pymupdf is None:
        sys.exit('pdfspotthediff: PyMuPDF not found'
                 ' (apt install python3-fitz or python3-pymupdf)')

    pdf1 = pymupdf.open(argv[1])
    pdf2 = pymupdf.open(argv[2])

    pos2color1, pos2color2 = compute_highlights(
        extract_words(pdf1), extract_words(pdf2))

    highlight(pdf1, pos2color1)
    highlight(pdf2, pos2color2)

    for pdf, fn in (pdf1, argv[1]), (pdf2, argv[2]):
        pdf.save(output_name(fn))


if __name__ == '__main__':
    main()