verify.go

v0.1.0
Doc Versions Source
1
package main
2
3
import (
4
	"fmt"
5
	"io"
6
	"os"
7
	"regexp"
8
	"strings"
9
10
	"github.com/spf13/cobra"
11
12
	"go.bigb.es/confluence-md-utilities/converter"
13
	"go.bigb.es/confluence-md-utilities/format"
14
)
15
16
// verifyIndent is the indentation string passed to format.PrettyXML for both
// sides of the comparison. It is bound to the --indent flag of verifyCmd.
var verifyIndent string
19
20
var verifyCmd = &cobra.Command{
21
	Use:   "verify [input.xml]",
22
	Short: "Verify round-trip fidelity of XML → Markdown → XML conversion",
23
	Long: `Check that Confluence XML survives a round-trip through Markdown and back.
24
25
Compares:
26
  A = fmt(input XML)
27
  B = fmt(xml2md(input XML) → md2xml → XML)
28
29
If A and B match, the round-trip is lossless. Otherwise, prints a diff.
30
31
Reads from stdin if no file is specified.`,
32
	Args: cobra.MaximumNArgs(1),
33
	RunE: func(cmd *cobra.Command, args []string) error {
34
		var input []byte
35
		var err error
36
37
		if len(args) > 0 {
38
			input, err = os.ReadFile(args[0])
39
		} else {
40
			input, err = io.ReadAll(os.Stdin)
41
		}
42
		if err != nil {
43
			return fmt.Errorf("reading input: %w", err)
44
		}
45
46
		xmlInput := string(input)
47
48
		// Normalize input: remove elements that cannot survive round-trip
49
		xmlInput = normalizeForVerify(xmlInput)
50
51
		// A: format the original XML
52
		formatted := format.PrettyXML(xmlInput, verifyIndent)
53
54
		// B: XML → Markdown → XML → format
55
		md, err := converter.ConfluenceToMarkdown(xmlInput)
56
		if err != nil {
57
			return fmt.Errorf("xml→markdown: %w", err)
58
		}
59
60
		xmlRoundTrip, err := converter.MarkdownToConfluence([]byte(md))
61
		if err != nil {
62
			return fmt.Errorf("markdown→xml: %w", err)
63
		}
64
65
		formattedRoundTrip := format.PrettyXML(xmlRoundTrip, verifyIndent)
66
67
		if formatted == formattedRoundTrip {
68
			fmt.Fprintln(os.Stderr, "OK: round-trip is lossless")
69
			return nil
70
		}
71
72
		// Print unified diff with colored inline highlights
73
		linesA := strings.Split(formatted, "\n")
74
		linesB := strings.Split(formattedRoundTrip, "\n")
75
76
		fmt.Fprintln(os.Stderr, "MISMATCH: round-trip produced different output")
77
		fmt.Fprintln(os.Stderr, "")
78
		fmt.Fprintf(os.Stderr, "%s--- original (formatted)%s\n", ansiRed, ansiReset)
79
		fmt.Fprintf(os.Stderr, "%s+++ round-trip (formatted)%s\n", ansiGreen, ansiReset)
80
81
		ops := computeDiffOps(linesA, linesB)
82
		hunks := buildHunks(ops, 3)
83
		for _, h := range hunks {
84
			printHunk(h)
85
		}
86
87
		os.Exit(1)
88
		return nil
89
	},
90
}
91
92
func init() {
93
	verifyCmd.Flags().StringVar(&verifyIndent, "indent", "  ", "Indentation string (default: 2 spaces)")
94
	rootCmd.AddCommand(verifyCmd)
95
}
96
97
// Patterns used by normalizeForVerify. They operate on the raw XML text.
var (
	// reEmptyParagraph matches a paragraph whose only content is a line break,
	// such as <p><br /></p> or <p><br/></p>, optionally padded with whitespace.
	reEmptyParagraph = regexp.MustCompile(`<p>\s*<br\s*/?>\s*</p>`)

	// reSpanInCode matches the first <span ...>text</span> that follows the
	// leading text of a <code> element. Group 1 is "<code>" plus the text in
	// front of the span; group 2 is the text inside the span.
	reSpanInCode = regexp.MustCompile(`(<code>[^<]*)<span[^>]*>([^<]*)</span>`)

	// reAdjacentCode matches a closing </code> immediately followed by an
	// opening <code>.
	reAdjacentCode = regexp.MustCompile(`</code><code>`)
)

// normalizeForVerify rewrites XML constructs that cannot survive a round-trip
// through Markdown, so that verify compares only what the converter is able
// to preserve. It applies three steps, in order:
//
//  1. remove empty <p><br/></p> paragraphs;
//  2. unwrap <span> elements that sit inside <code>, keeping their text;
//  3. merge directly adjacent <code> elements into one.
func normalizeForVerify(xml string) string {
	out := reEmptyParagraph.ReplaceAllString(xml, "")

	// One pass unwraps at most one span per <code> opening tag, so a <code>
	// holding several spans needs several passes. Every successful
	// replacement shortens the string, so an unchanged result means there
	// is nothing left to unwrap.
	for {
		unwrapped := reSpanInCode.ReplaceAllString(out, "${1}${2}")
		if unwrapped == out {
			break
		}
		out = unwrapped
	}

	// Dropping "</code><code>" joins the two neighbouring elements.
	return reAdjacentCode.ReplaceAllString(out, "")
}
118
119
// ANSI escape sequences used to color the diff output.
const (
	ansiReset = "\033[0m"     // clear all attributes
	ansiRed   = "\033[31m"    // red text
	ansiGreen = "\033[32m"    // green text
	ansiCyan  = "\033[36m"    // cyan text
	ansiBold  = "\033[1m"     // bold text
	ansiRedBg = "\033[41;37m" // red background, white text
	ansiGrnBg = "\033[42;30m" // green background, black text
)
129
130
// diffOp classifies a single line of a line-level diff between two texts,
// referred to as A and B.
type diffOp int

// The kinds of diff line.
const (
	// opEqual (the zero value) marks a line present in both A and B.
	opEqual  diffOp = iota
	opRemove        // line present only in A
	opAdd           // line present only in B
)
138
139
// diffLine is a single line in the diff with its operation and source positions.
140
type diffLine struct {
141
	op    diffOp
142
	text  string
143
	lineA int // 1-based line number in A (-1 if not applicable)
144
	lineB int // 1-based line number in B (-1 if not applicable)
145
}
146
147
// hunk is a group of diff lines with surrounding context.
148
type hunk struct {
149
	startA, countA int // 1-based start and count for A
150
	startB, countB int // 1-based start and count for B
151
	lines          []diffLine
152
}
153
154
// computeDiffOps produces a sequence of diff operations from two line slices
155
// using LCS-based algorithm.
156
func computeDiffOps(a, b []string) []diffLine {
157
	m, n := len(a), len(b)
158
	dp := make([][]int, m+1)
159
	for i := range dp {
160
		dp[i] = make([]int, n+1)
161
	}
162
	for i := 1; i <= m; i++ {
163
		for j := 1; j <= n; j++ {
164
			if a[i-1] == b[j-1] {
165
				dp[i][j] = dp[i-1][j-1] + 1
166
			} else if dp[i-1][j] >= dp[i][j-1] {
167
				dp[i][j] = dp[i-1][j]
168
			} else {
169
				dp[i][j] = dp[i][j-1]
170
			}
171
		}
172
	}
173
174
	// Backtrack to produce operations
175
	var ops []diffLine
176
	i, j := m, n
177
	for i > 0 || j > 0 {
178
		if i > 0 && j > 0 && a[i-1] == b[j-1] {
179
			ops = append(ops, diffLine{op: opEqual, text: a[i-1], lineA: i, lineB: j})
180
			i--
181
			j--
182
		} else if j > 0 && (i == 0 || dp[i][j-1] >= dp[i-1][j]) {
183
			ops = append(ops, diffLine{op: opAdd, text: b[j-1], lineA: -1, lineB: j})
184
			j--
185
		} else {
186
			ops = append(ops, diffLine{op: opRemove, text: a[i-1], lineA: i, lineB: -1})
187
			i--
188
		}
189
	}
190
	// Reverse — we built it backwards
191
	for l, r := 0, len(ops)-1; l < r; l, r = l+1, r-1 {
192
		ops[l], ops[r] = ops[r], ops[l]
193
	}
194
	return ops
195
}
196
197
// buildHunks groups diff operations into unified-diff hunks with `ctx` context lines.
198
func buildHunks(ops []diffLine, ctx int) []hunk {
199
	// Find ranges of changed lines, expanded by context
200
	type span struct{ start, end int } // indices into ops
201
	var changed []span
202
	for i, op := range ops {
203
		if op.op != opEqual {
204
			if len(changed) > 0 && i-changed[len(changed)-1].end <= 2*ctx {
205
				// Merge with previous span
206
				changed[len(changed)-1].end = i + 1
207
			} else {
208
				changed = append(changed, span{i, i + 1})
209
			}
210
		}
211
	}
212
213
	var hunks []hunk
214
	for _, ch := range changed {
215
		lo := ch.start - ctx
216
		if lo < 0 {
217
			lo = 0
218
		}
219
		hi := ch.end + ctx
220
		if hi > len(ops) {
221
			hi = len(ops)
222
		}
223
224
		h := hunk{lines: ops[lo:hi]}
225
226
		// Compute start lines and counts
227
		h.startA, h.startB = 1, 1
228
		if len(h.lines) > 0 {
229
			// Find first valid line numbers
230
			for _, dl := range h.lines {
231
				if dl.lineA > 0 {
232
					h.startA = dl.lineA
233
					break
234
				}
235
				if dl.lineB > 0 {
236
					h.startB = dl.lineB
237
					break
238
				}
239
			}
240
			if h.lines[0].lineA > 0 {
241
				h.startA = h.lines[0].lineA
242
			}
243
			if h.lines[0].lineB > 0 {
244
				h.startB = h.lines[0].lineB
245
			}
246
		}
247
		for _, dl := range h.lines {
248
			if dl.op == opEqual || dl.op == opRemove {
249
				h.countA++
250
			}
251
			if dl.op == opEqual || dl.op == opAdd {
252
				h.countB++
253
			}
254
		}
255
		hunks = append(hunks, h)
256
	}
257
	return hunks
258
}
259
260
// printHunk outputs a single unified diff hunk with ANSI colors and inline highlights.
261
func printHunk(h hunk) {
262
	// @@ header
263
	fmt.Fprintf(os.Stdout, "%s@@ -%d,%d +%d,%d @@%s\n",
264
		ansiCyan, h.startA, h.countA, h.startB, h.countB, ansiReset)
265
266
	lines := h.lines
267
268
	for i := 0; i < len(lines); i++ {
269
		dl := lines[i]
270
		switch dl.op {
271
		case opEqual:
272
			fmt.Printf(" %s\n", dl.text)
273
274
		case opRemove:
275
			// Try to pair with subsequent add(s) for inline highlighting
276
			remStart := i
277
			for i+1 < len(lines) && lines[i+1].op == opRemove {
278
				i++
279
			}
280
			remEnd := i + 1
281
			addStart := remEnd
282
			j := addStart
283
			for j < len(lines) && lines[j].op == opAdd {
284
				j++
285
			}
286
			addEnd := j
287
288
			removed := lines[remStart:remEnd]
289
			added := lines[addStart:addEnd]
290
291
			// Pair up removed/added lines for inline diff
292
			pairs := min(len(removed), len(added))
293
			for p := range pairs {
294
				hl, hr := inlineHighlight(removed[p].text, added[p].text)
295
				fmt.Printf("%s-%s%s\n", ansiRed, hl, ansiReset)
296
				fmt.Printf("%s+%s%s\n", ansiGreen, hr, ansiReset)
297
			}
298
			// Remaining unpaired lines
299
			for p := pairs; p < len(removed); p++ {
300
				fmt.Printf("%s-%s%s\n", ansiRed, removed[p].text, ansiReset)
301
			}
302
			for p := pairs; p < len(added); p++ {
303
				fmt.Printf("%s+%s%s\n", ansiGreen, added[p].text, ansiReset)
304
			}
305
306
			i = addEnd - 1 // -1 because loop increments
307
308
		case opAdd:
309
			// Unpaired add (not preceded by remove)
310
			fmt.Printf("%s+%s%s\n", ansiGreen, dl.text, ansiReset)
311
		}
312
	}
313
}
314
315
// inlineHighlight returns two strings (for removed and added lines) with ANSI
316
// bold marking on the parts that actually differ.
317
func inlineHighlight(a, b string) (string, string) {
318
	ra := []rune(a)
319
	rb := []rune(b)
320
321
	// Common prefix
322
	pfx := 0
323
	for pfx < len(ra) && pfx < len(rb) && ra[pfx] == rb[pfx] {
324
		pfx++
325
	}
326
	// Common suffix (from the end, but don't overlap with prefix)
327
	sfx := 0
328
	for sfx < len(ra)-pfx && sfx < len(rb)-pfx && ra[len(ra)-1-sfx] == rb[len(rb)-1-sfx] {
329
		sfx++
330
	}
331
332
	midA := ra[pfx : len(ra)-sfx]
333
	midB := rb[pfx : len(rb)-sfx]
334
335
	if len(midA) == 0 && len(midB) == 0 {
336
		// Lines are identical — no highlighting needed
337
		return a, b
338
	}
339
340
	prefix := string(ra[:pfx])
341
	suffix := string(ra[len(ra)-sfx:])
342
343
	hlA := prefix + ansiBold + ansiRedBg + string(midA) + ansiReset + ansiRed + suffix
344
	hlB := prefix + ansiBold + ansiGrnBg + string(midB) + ansiReset + ansiGreen + suffix
345
346
	return hlA, hlB
347
}
348

Source Files