verify.go

v0.1.0

package main

import (
	"fmt"
	"io"
	"os"
	"regexp"
	"strings"

	"github.com/spf13/cobra"

	"go.bigb.es/confluence-md-utilities/converter"
	"go.bigb.es/confluence-md-utilities/format"
)

// Flag storage for the verify subcommand.
var (
	// verifyIndent is the indentation string passed to format.PrettyXML for
	// both sides of the comparison; init binds it to the --indent flag.
	verifyIndent string
)

// verifyCmd implements the "verify" subcommand. It normalizes the input XML,
// pretty-prints it (A), converts the same normalized XML to Markdown and back
// and pretty-prints that result (B), then compares A and B as strings.
//
// Output streams: the status lines ("OK: ..." / "MISMATCH: ...") go to stderr;
// the diff itself — the ---/+++ file headers and every hunk — goes to stdout,
// so redirecting stdout captures the whole diff and nothing else.
//
// On mismatch the process exits with status 1 after printing the diff.
var verifyCmd = &cobra.Command{
	Use:   "verify [input.xml]",
	Short: "Verify round-trip fidelity of XML → Markdown → XML conversion",
	Long: `Check that Confluence XML survives a round-trip through Markdown and back.

Compares:
  A = fmt(input XML)
  B = fmt(xml2md(input XML) → md2xml → XML)

If A and B match, the round-trip is lossless. Otherwise, prints a diff.

Reads from stdin if no file is specified.`,
	Args: cobra.MaximumNArgs(1),
	RunE: func(cmd *cobra.Command, args []string) error {
		var input []byte
		var err error

		// The single optional argument is the input file; without it the
		// document is read from stdin.
		if len(args) > 0 {
			input, err = os.ReadFile(args[0])
		} else {
			input, err = io.ReadAll(os.Stdin)
		}
		if err != nil {
			return fmt.Errorf("reading input: %w", err)
		}

		xmlInput := string(input)

		// Normalize input: remove elements that cannot survive round-trip.
		// Both sides of the comparison start from this normalized text.
		xmlInput = normalizeForVerify(xmlInput)

		// A: format the original XML
		formatted := format.PrettyXML(xmlInput, verifyIndent)

		// B: XML → Markdown → XML → format
		md, err := converter.ConfluenceToMarkdown(xmlInput)
		if err != nil {
			return fmt.Errorf("xml→markdown: %w", err)
		}

		xmlRoundTrip, err := converter.MarkdownToConfluence([]byte(md))
		if err != nil {
			return fmt.Errorf("markdown→xml: %w", err)
		}

		formattedRoundTrip := format.PrettyXML(xmlRoundTrip, verifyIndent)

		if formatted == formattedRoundTrip {
			fmt.Fprintln(os.Stderr, "OK: round-trip is lossless")
			return nil
		}

		// Print unified diff with colored inline highlights
		linesA := strings.Split(formatted, "\n")
		linesB := strings.Split(formattedRoundTrip, "\n")

		fmt.Fprintln(os.Stderr, "MISMATCH: round-trip produced different output")
		fmt.Fprintln(os.Stderr, "")

		// The file headers belong to the diff, so they are written to the
		// same stream as the hunks (stdout); previously they went to stderr
		// and were separated from the hunks under redirection.
		fmt.Fprintf(os.Stdout, "%s--- original (formatted)%s\n", ansiRed, ansiReset)
		fmt.Fprintf(os.Stdout, "%s+++ round-trip (formatted)%s\n", ansiGreen, ansiReset)

		ops := computeDiffOps(linesA, linesB)
		hunks := buildHunks(ops, 3) // 3 lines of context around each change
		for _, h := range hunks {
			printHunk(h)
		}

		// Exit directly with a failure status instead of returning an error.
		// NOTE(review): presumably this avoids cobra's own error/usage output
		// for what is an expected outcome — confirm. os.Exit does not run
		// deferred functions.
		os.Exit(1)
		return nil // unreachable; required because os.Exit is not a terminating statement
	},
}

// init declares the verify command's --indent flag (stored in verifyIndent,
// defaulting to two spaces) and attaches the command to rootCmd.
func init() {
	flags := verifyCmd.Flags()
	flags.StringVar(&verifyIndent, "indent", "  ", "Indentation string (default: 2 spaces)")

	rootCmd.AddCommand(verifyCmd)
}

// Patterns used by normalizeForVerify.
var (
	// reEmptyParagraph matches a paragraph whose only content is a single
	// line break — <p><br /></p>, <p><br/></p>, <p><br></p> — allowing
	// whitespace around the break. A <p> carrying attributes is not matched.
	reEmptyParagraph = regexp.MustCompile(`<p>\s*<br\s*/?>\s*</p>`)
	// reSpanInCode matches an attribute-less <code> tag, any plain text after
	// it, and then one <span ...>text</span> whose content holds no further
	// tags. Group 1 is the <code> tag plus leading text, group 2 the span's text.
	reSpanInCode = regexp.MustCompile(`(<code>[^<]*)<span[^>]*>([^<]*)</span>`)
	// reAdjacentCode matches a </code> immediately followed by <code>.
	reAdjacentCode = regexp.MustCompile(`</code><code>`)
)

// normalizeForVerify rewrites xml so that it only contains constructs the
// verify comparison expects to survive the Markdown round-trip:
//
//  1. break-only paragraphs are deleted;
//  2. <span> wrappers at the front of a <code> element are replaced by their text;
//  3. directly adjacent <code> elements are fused into one.
func normalizeForVerify(xml string) string {
	out := reEmptyParagraph.ReplaceAllString(xml, "")

	// One pass unwraps at most the first span of each <code> element, because
	// the pattern is anchored on the opening tag. Repeat until a pass changes
	// nothing so that consecutive spans are all unwrapped. A span nested
	// directly inside another span never matches and is left as is.
	for {
		unwrapped := reSpanInCode.ReplaceAllString(out, "${1}${2}")
		if unwrapped == out {
			break
		}
		out = unwrapped
	}

	// Dropping the "</code><code>" seam joins neighbouring code elements.
	return reAdjacentCode.ReplaceAllString(out, "")
}

// ANSI escape codes for diff output.
// Red marks removed lines and the "---" header, green marks added lines and
// the "+++" header, cyan marks hunk headers; bold plus a background color
// marks the differing part of a paired removed/added line.
const (
	ansiReset  = "\033[0m" // clear all attributes
	ansiRed    = "\033[31m" // red foreground
	ansiGreen  = "\033[32m" // green foreground
	ansiCyan   = "\033[36m" // cyan foreground
	ansiBold   = "\033[1m" // bold
	ansiRedBg  = "\033[41;37m" // red background, white text
	ansiGrnBg  = "\033[42;30m" // green background, black text
)

// diffOp represents a line-level diff operation, i.e. how one line of the
// edit script relates to the two inputs A (original) and B (round-trip).
type diffOp int

// The possible operations. opEqual is the zero value.
const (
	opEqual  diffOp = iota // line present in both A and B
	opRemove        // line only in A
	opAdd           // line only in B
)

// diffLine is a single line in the diff with its operation and source positions.
// For opEqual both line numbers are set; for opRemove only lineA is set and
// for opAdd only lineB is set, with the other field holding -1.
type diffLine struct {
	op    diffOp // how this line relates to A and B
	text  string // the line's content, without a trailing newline
	lineA int // 1-based line number in A (-1 if not applicable)
	lineB int // 1-based line number in B (-1 if not applicable)
}

// hunk is a group of diff lines with surrounding context.
// The start/count pairs are the values printed in the "@@ -a,b +c,d @@"
// header: countA counts the equal and removed lines of the hunk, countB the
// equal and added lines.
type hunk struct {
	startA, countA int // 1-based start and count for A
	startB, countB int // 1-based start and count for B
	lines          []diffLine // the hunk's lines in output order; a sub-slice of the full op list
}

// computeDiffOps returns a minimal line-level edit script that turns a into b.
//
// The script lists every line of a and b exactly once, in order: opEqual for
// lines kept (lineA and lineB both set), opRemove for lines only in a
// (lineB = -1) and opAdd for lines only in b (lineA = -1). Line numbers are
// 1-based. Within a run of changes, removals always precede additions, which
// printHunk relies on to pair them up. Two empty inputs yield nil.
//
// The script is derived from a longest-common-subsequence table. Because the
// table needs (m+1)*(n+1) cells, lines shared at the start and at the end of
// both inputs are matched up front and only the differing middle goes through
// the table; for two nearly identical documents this shrinks the table from
// quadratic in the document size to quadratic in the size of the changed
// region. When a line occurs several times, the shared prefix/suffix decides
// which occurrence is treated as unchanged.
func computeDiffOps(a, b []string) []diffLine {
	if len(a) == 0 && len(b) == 0 {
		return nil
	}

	// Length of the common prefix.
	pfx := 0
	for pfx < len(a) && pfx < len(b) && a[pfx] == b[pfx] {
		pfx++
	}
	// Length of the common suffix, never reaching back into the prefix.
	sfx := 0
	for sfx < len(a)-pfx && sfx < len(b)-pfx && a[len(a)-1-sfx] == b[len(b)-1-sfx] {
		sfx++
	}

	midA := a[pfx : len(a)-sfx]
	midB := b[pfx : len(b)-sfx]
	m, n := len(midA), len(midB)

	// Every line of the middle appears at most once per side, so this
	// capacity is an upper bound on the script length.
	ops := make([]diffLine, 0, pfx+m+n+sfx)

	for k := 0; k < pfx; k++ {
		ops = append(ops, diffLine{op: opEqual, text: a[k], lineA: k + 1, lineB: k + 1})
	}

	// dp[i][j] is the LCS length of midA[:i] and midB[:j]. The table is only
	// consulted while both i and j are positive, so it is not built when one
	// side of the middle is empty.
	var dp [][]int
	if m > 0 && n > 0 {
		dp = make([][]int, m+1)
		for i := range dp {
			dp[i] = make([]int, n+1)
		}
		for i := 1; i <= m; i++ {
			for j := 1; j <= n; j++ {
				switch {
				case midA[i-1] == midB[j-1]:
					dp[i][j] = dp[i-1][j-1] + 1
				case dp[i-1][j] >= dp[i][j-1]:
					dp[i][j] = dp[i-1][j]
				default:
					dp[i][j] = dp[i][j-1]
				}
			}
		}
	}

	// Backtrack from the bottom-right corner. Operations come out last to
	// first; on a tie the addition is taken first so that, once reversed,
	// removals precede additions.
	i, j := m, n
	for i > 0 || j > 0 {
		switch {
		case i > 0 && j > 0 && midA[i-1] == midB[j-1]:
			ops = append(ops, diffLine{op: opEqual, text: midA[i-1], lineA: pfx + i, lineB: pfx + j})
			i--
			j--
		case j > 0 && (i == 0 || dp[i][j-1] >= dp[i-1][j]):
			ops = append(ops, diffLine{op: opAdd, text: midB[j-1], lineA: -1, lineB: pfx + j})
			j--
		default:
			ops = append(ops, diffLine{op: opRemove, text: midA[i-1], lineA: pfx + i, lineB: -1})
			i--
		}
	}
	// Only the middle section was built backwards; the prefix is already in order.
	for l, r := pfx, len(ops)-1; l < r; l, r = l+1, r-1 {
		ops[l], ops[r] = ops[r], ops[l]
	}

	for k := 0; k < sfx; k++ {
		ia, ib := len(a)-sfx+k, len(b)-sfx+k
		ops = append(ops, diffLine{op: opEqual, text: a[ia], lineA: ia + 1, lineB: ib + 1})
	}
	return ops
}

// buildHunks groups diff operations into unified-diff hunks with `ctx` context lines.
//
// Each maximal run of changed lines is widened by up to ctx unchanged lines on
// both sides. Two runs separated by at most 2*ctx unchanged lines end up in
// the same hunk, so hunks never overlap. A negative ctx is treated as 0.
// Nil is returned when ops contains no change.
//
// For every hunk, countA/countB are the number of A-side (equal + removed) and
// B-side (equal + added) lines it contains. startA/startB are one more than
// the number of A/B lines that precede the hunk, i.e. the 1-based position in
// each input at which the hunk begins. They are computed from those counts
// rather than read off the hunk's first line, so they stay correct when the
// hunk begins with an added or removed line, where one of the two line
// numbers is not recorded.
//
// The hunks' lines are sub-slices of ops, not copies.
func buildHunks(ops []diffLine, ctx int) []hunk {
	if ctx < 0 {
		// A negative context would produce an inverted slice range below.
		ctx = 0
	}

	// Collect half-open index ranges [start, end) of changed lines, bridging
	// gaps of unchanged lines that two neighbouring contexts would cover.
	type span struct{ start, end int } // indices into ops
	var changed []span
	for i, op := range ops {
		if op.op == opEqual {
			continue
		}
		if last := len(changed) - 1; last >= 0 && i-changed[last].end <= 2*ctx {
			changed[last].end = i + 1
			continue
		}
		changed = append(changed, span{i, i + 1})
	}

	var hunks []hunk
	// pos walks ops once across all hunks; seenA/seenB count the A-side and
	// B-side lines in ops[:pos].
	pos, seenA, seenB := 0, 0, 0
	for _, ch := range changed {
		lo := ch.start - ctx
		if lo < 0 {
			lo = 0
		}
		hi := ch.end + ctx
		if hi > len(ops) {
			hi = len(ops)
		}

		// Hunk boundaries only move forward, so resume counting where the
		// previous hunk's scan stopped.
		for ; pos < lo; pos++ {
			switch ops[pos].op {
			case opEqual:
				seenA++
				seenB++
			case opRemove:
				seenA++
			case opAdd:
				seenB++
			}
		}

		h := hunk{
			lines:  ops[lo:hi],
			startA: seenA + 1,
			startB: seenB + 1,
		}
		for _, dl := range h.lines {
			if dl.op == opEqual || dl.op == opRemove {
				h.countA++
			}
			if dl.op == opEqual || dl.op == opAdd {
				h.countB++
			}
		}
		hunks = append(hunks, h)
	}
	return hunks
}

// printHunk writes one hunk to stdout in unified-diff layout with ANSI colors:
// a cyan "@@ -a,b +c,d @@" header, unchanged lines prefixed with a space,
// removed lines in red with "-", added lines in green with "+".
//
// A run of removals directly followed by a run of additions is treated as a
// replacement: the k-th removed line is printed together with the k-th added
// line, with the differing part of each marked by inlineHighlight. Lines left
// over on the longer side, and additions that do not follow a removal, are
// printed without inline marks.
func printHunk(h hunk) {
	out := os.Stdout
	fmt.Fprintf(out, "%s@@ -%d,%d +%d,%d @@%s\n",
		ansiCyan, h.startA, h.countA, h.startB, h.countB, ansiReset)

	entries := h.lines

	// runEnd reports the index just past the run of consecutive `want`
	// operations beginning at from (from itself when there is no such run).
	runEnd := func(from int, want diffOp) int {
		for from < len(entries) && entries[from].op == want {
			from++
		}
		return from
	}

	for pos := 0; pos < len(entries); {
		cur := entries[pos]
		next := pos + 1

		switch cur.op {
		case opEqual:
			fmt.Fprintf(out, " %s\n", cur.text)

		case opAdd:
			// An addition reached here was not consumed by a preceding removal run.
			fmt.Fprintf(out, "%s+%s%s\n", ansiGreen, cur.text, ansiReset)

		case opRemove:
			delEnd := runEnd(pos, opRemove)
			addEnd := runEnd(delEnd, opAdd)
			dels, adds := entries[pos:delEnd], entries[delEnd:addEnd]

			// Walk both runs in lockstep; once the shorter one is used up,
			// only the longer one still produces output.
			for k := 0; k < len(dels) || k < len(adds); k++ {
				switch {
				case k < len(dels) && k < len(adds):
					oldText, newText := inlineHighlight(dels[k].text, adds[k].text)
					fmt.Fprintf(out, "%s-%s%s\n", ansiRed, oldText, ansiReset)
					fmt.Fprintf(out, "%s+%s%s\n", ansiGreen, newText, ansiReset)
				case k < len(dels):
					fmt.Fprintf(out, "%s-%s%s\n", ansiRed, dels[k].text, ansiReset)
				default:
					fmt.Fprintf(out, "%s+%s%s\n", ansiGreen, adds[k].text, ansiReset)
				}
			}

			// Continue after the additions that were just printed.
			next = addEnd
		}

		pos = next
	}
}

// inlineHighlight marks where a removed line (a) and its replacement (b)
// differ. The text both share at the start and at the end is left alone; the
// part in between is wrapped in bold plus a red (for a) or green (for b)
// background, after which the line's plain red/green color is switched back
// on for the shared ending. Comparison is done rune by rune.
//
// If the two lines are identical they are returned unchanged.
func inlineHighlight(a, b string) (string, string) {
	left, right := []rune(a), []rune(b)

	shorter := len(left)
	if len(right) < shorter {
		shorter = len(right)
	}

	// Runes shared at the front.
	head := 0
	for head < shorter && left[head] == right[head] {
		head++
	}

	// Runes shared at the back; limited to what the shorter line has left
	// after the shared front so the two regions cannot overlap.
	tail := 0
	for tail < shorter-head && left[len(left)-1-tail] == right[len(right)-1-tail] {
		tail++
	}

	endA, endB := len(left)-tail, len(right)-tail
	if head == endA && head == endB {
		// Nothing differs between the shared front and back.
		return a, b
	}

	lead := string(left[:head])
	trail := string(left[endA:])

	// mark wraps the differing runes in bold+background and restores the
	// line's foreground color before the shared ending.
	mark := func(background string, diff []rune, foreground string) string {
		return lead + ansiBold + background + string(diff) + ansiReset + foreground + trail
	}

	return mark(ansiRedBg, left[head:endA], ansiRed), mark(ansiGrnBg, right[head:endB], ansiGreen)
}


1	package main
2
3	import (
4	"fmt"
5	"io"
6	"os"
7	"regexp"
8	"strings"
9
10	"github.com/spf13/cobra"
11
12	"go.bigb.es/confluence-md-utilities/converter"
13	"go.bigb.es/confluence-md-utilities/format"
14	)
15
16	var (
17	verifyIndent string
18	)
19
20	var verifyCmd = &cobra.Command{
21	Use: "verify [input.xml]",
22	Short: "Verify round-trip fidelity of XML → Markdown → XML conversion",
23	Long: `Check that Confluence XML survives a round-trip through Markdown and back.
24
25	Compares:
26	A = fmt(input XML)
27	B = fmt(xml2md(input XML) → md2xml → XML)
28
29	If A and B match, the round-trip is lossless. Otherwise, prints a diff.
30
31	Reads from stdin if no file is specified.`,
32	Args: cobra.MaximumNArgs(1),
33	RunE: func(cmd *cobra.Command, args []string) error {
34	var input []byte
35	var err error
36
37	if len(args) > 0 {
38	input, err = os.ReadFile(args[0])
39	} else {
40	input, err = io.ReadAll(os.Stdin)
41	}
42	if err != nil {
43	return fmt.Errorf("reading input: %w", err)
44	}
45
46	xmlInput := string(input)
47
48	// Normalize input: remove elements that cannot survive round-trip
49	xmlInput = normalizeForVerify(xmlInput)
50
51	// A: format the original XML
52	formatted := format.PrettyXML(xmlInput, verifyIndent)
53
54	// B: XML → Markdown → XML → format
55	md, err := converter.ConfluenceToMarkdown(xmlInput)
56	if err != nil {
57	return fmt.Errorf("xml→markdown: %w", err)
58	}
59
60	xmlRoundTrip, err := converter.MarkdownToConfluence([]byte(md))
61	if err != nil {
62	return fmt.Errorf("markdown→xml: %w", err)
63	}
64
65	formattedRoundTrip := format.PrettyXML(xmlRoundTrip, verifyIndent)
66
67	if formatted == formattedRoundTrip {
68	fmt.Fprintln(os.Stderr, "OK: round-trip is lossless")
69	return nil
70	}
71
72	// Print unified diff with colored inline highlights
73	linesA := strings.Split(formatted, "\n")
74	linesB := strings.Split(formattedRoundTrip, "\n")
75
76	fmt.Fprintln(os.Stderr, "MISMATCH: round-trip produced different output")
77	fmt.Fprintln(os.Stderr, "")
78	fmt.Fprintf(os.Stderr, "%s--- original (formatted)%s\n", ansiRed, ansiReset)
79	fmt.Fprintf(os.Stderr, "%s+++ round-trip (formatted)%s\n", ansiGreen, ansiReset)
80
81	ops := computeDiffOps(linesA, linesB)
82	hunks := buildHunks(ops, 3)
83	for _, h := range hunks {
84	printHunk(h)
85	}
86
87	os.Exit(1)
88	return nil
89	},
90	}
91
92	func init() {
93	verifyCmd.Flags().StringVar(&verifyIndent, "indent", " ", "Indentation string (default: 2 spaces)")
94	rootCmd.AddCommand(verifyCmd)
95	}
96
97	var (
98	// reEmptyParagraph matches empty paragraphs like <p><br /></p>, <p><br/></p>, etc.
99	reEmptyParagraph = regexp.MustCompile(`<p>\s<br\s/?>\s*</p>`)
100	// reSpanInCode matches <span...>...</span> inside <code>, unwrapping to just the text.
101	reSpanInCode = regexp.MustCompile(`(<code>[^<])<span[^>]>([^<]*)</span>`)
102	// reAdjacentCode matches </code><code> (directly adjacent), merging into one span.
103	reAdjacentCode = regexp.MustCompile(`</code><code>`)
104	)
105
106	// normalizeForVerify strips XML patterns that cannot survive a round-trip
107	// through Markdown, so verify compares only what the converter can preserve.
108	func normalizeForVerify(xml string) string {
109	xml = reEmptyParagraph.ReplaceAllString(xml, "")
110	// Unwrap <span> inside <code> (apply repeatedly for nested cases)
111	for reSpanInCode.MatchString(xml) {
112	xml = reSpanInCode.ReplaceAllString(xml, "${1}${2}")
113	}
114	// Merge adjacent <code> elements
115	xml = reAdjacentCode.ReplaceAllString(xml, "")
116	return xml
117	}
118
119	// ANSI escape codes for diff output.
120	const (
121	ansiReset = "\033[0m"
122	ansiRed = "\033[31m"
123	ansiGreen = "\033[32m"
124	ansiCyan = "\033[36m"
125	ansiBold = "\033[1m"
126	ansiRedBg = "\033[41;37m" // red background, white text
127	ansiGrnBg = "\033[42;30m" // green background, black text
128	)
129
130	// diffOp represents a line-level diff operation.
131	type diffOp int
132
133	const (
134	opEqual diffOp = iota
135	opRemove // line only in A
136	opAdd // line only in B
137	)
138
139	// diffLine is a single line in the diff with its operation and source positions.
140	type diffLine struct {
141	op diffOp
142	text string
143	lineA int // 1-based line number in A (-1 if not applicable)
144	lineB int // 1-based line number in B (-1 if not applicable)
145	}
146
147	// hunk is a group of diff lines with surrounding context.
148	type hunk struct {
149	startA, countA int // 1-based start and count for A
150	startB, countB int // 1-based start and count for B
151	lines []diffLine
152	}
153
154	// computeDiffOps produces a sequence of diff operations from two line slices
155	// using LCS-based algorithm.
156	func computeDiffOps(a, b []string) []diffLine {
157	m, n := len(a), len(b)
158	dp := make([][]int, m+1)
159	for i := range dp {
160	dp[i] = make([]int, n+1)
161	}
162	for i := 1; i <= m; i++ {
163	for j := 1; j <= n; j++ {
164	if a[i-1] == b[j-1] {
165	dp[i][j] = dp[i-1][j-1] + 1
166	} else if dp[i-1][j] >= dp[i][j-1] {
167	dp[i][j] = dp[i-1][j]
168	} else {
169	dp[i][j] = dp[i][j-1]
170	}
171	}
172	}
173
174	// Backtrack to produce operations
175	var ops []diffLine
176	i, j := m, n
177	for i > 0 \|\| j > 0 {
178	if i > 0 && j > 0 && a[i-1] == b[j-1] {
179	ops = append(ops, diffLine{op: opEqual, text: a[i-1], lineA: i, lineB: j})
180	i--
181	j--
182	} else if j > 0 && (i == 0 \|\| dp[i][j-1] >= dp[i-1][j]) {
183	ops = append(ops, diffLine{op: opAdd, text: b[j-1], lineA: -1, lineB: j})
184	j--
185	} else {
186	ops = append(ops, diffLine{op: opRemove, text: a[i-1], lineA: i, lineB: -1})
187	i--
188	}
189	}
190	// Reverse — we built it backwards
191	for l, r := 0, len(ops)-1; l < r; l, r = l+1, r-1 {
192	ops[l], ops[r] = ops[r], ops[l]
193	}
194	return ops
195	}
196
197	// buildHunks groups diff operations into unified-diff hunks with `ctx` context lines.
198	func buildHunks(ops []diffLine, ctx int) []hunk {
199	// Find ranges of changed lines, expanded by context
200	type span struct{ start, end int } // indices into ops
201	var changed []span
202	for i, op := range ops {
203	if op.op != opEqual {
204	if len(changed) > 0 && i-changed[len(changed)-1].end <= 2*ctx {
205	// Merge with previous span
206	changed[len(changed)-1].end = i + 1
207	} else {
208	changed = append(changed, span{i, i + 1})
209	}
210	}
211	}
212
213	var hunks []hunk
214	for _, ch := range changed {
215	lo := ch.start - ctx
216	if lo < 0 {
217	lo = 0
218	}
219	hi := ch.end + ctx
220	if hi > len(ops) {
221	hi = len(ops)
222	}
223
224	h := hunk{lines: ops[lo:hi]}
225
226	// Compute start lines and counts
227	h.startA, h.startB = 1, 1
228	if len(h.lines) > 0 {
229	// Find first valid line numbers
230	for _, dl := range h.lines {
231	if dl.lineA > 0 {
232	h.startA = dl.lineA
233	break
234	}
235	if dl.lineB > 0 {
236	h.startB = dl.lineB
237	break
238	}
239	}
240	if h.lines[0].lineA > 0 {
241	h.startA = h.lines[0].lineA
242	}
243	if h.lines[0].lineB > 0 {
244	h.startB = h.lines[0].lineB
245	}
246	}
247	for _, dl := range h.lines {
248	if dl.op == opEqual \|\| dl.op == opRemove {
249	h.countA++
250	}
251	if dl.op == opEqual \|\| dl.op == opAdd {
252	h.countB++
253	}
254	}
255	hunks = append(hunks, h)
256	}
257	return hunks
258	}
259
260	// printHunk outputs a single unified diff hunk with ANSI colors and inline highlights.
261	func printHunk(h hunk) {
262	// @@ header
263	fmt.Fprintf(os.Stdout, "%s@@ -%d,%d +%d,%d @@%s\n",
264	ansiCyan, h.startA, h.countA, h.startB, h.countB, ansiReset)
265
266	lines := h.lines
267
268	for i := 0; i < len(lines); i++ {
269	dl := lines[i]
270	switch dl.op {
271	case opEqual:
272	fmt.Printf(" %s\n", dl.text)
273
274	case opRemove:
275	// Try to pair with subsequent add(s) for inline highlighting
276	remStart := i
277	for i+1 < len(lines) && lines[i+1].op == opRemove {
278	i++
279	}
280	remEnd := i + 1
281	addStart := remEnd
282	j := addStart
283	for j < len(lines) && lines[j].op == opAdd {
284	j++
285	}
286	addEnd := j
287
288	removed := lines[remStart:remEnd]
289	added := lines[addStart:addEnd]
290
291	// Pair up removed/added lines for inline diff
292	pairs := min(len(removed), len(added))
293	for p := range pairs {
294	hl, hr := inlineHighlight(removed[p].text, added[p].text)
295	fmt.Printf("%s-%s%s\n", ansiRed, hl, ansiReset)
296	fmt.Printf("%s+%s%s\n", ansiGreen, hr, ansiReset)
297	}
298	// Remaining unpaired lines
299	for p := pairs; p < len(removed); p++ {
300	fmt.Printf("%s-%s%s\n", ansiRed, removed[p].text, ansiReset)
301	}
302	for p := pairs; p < len(added); p++ {
303	fmt.Printf("%s+%s%s\n", ansiGreen, added[p].text, ansiReset)
304	}
305
306	i = addEnd - 1 // -1 because loop increments
307
308	case opAdd:
309	// Unpaired add (not preceded by remove)
310	fmt.Printf("%s+%s%s\n", ansiGreen, dl.text, ansiReset)
311	}
312	}
313	}
314
315	// inlineHighlight returns two strings (for removed and added lines) with ANSI
316	// bold marking on the parts that actually differ.
317	func inlineHighlight(a, b string) (string, string) {
318	ra := []rune(a)
319	rb := []rune(b)
320
321	// Common prefix
322	pfx := 0
323	for pfx < len(ra) && pfx < len(rb) && ra[pfx] == rb[pfx] {
324	pfx++
325	}
326	// Common suffix (from the end, but don't overlap with prefix)
327	sfx := 0
328	for sfx < len(ra)-pfx && sfx < len(rb)-pfx && ra[len(ra)-1-sfx] == rb[len(rb)-1-sfx] {
329	sfx++
330	}
331
332	midA := ra[pfx : len(ra)-sfx]
333	midB := rb[pfx : len(rb)-sfx]
334
335	if len(midA) == 0 && len(midB) == 0 {
336	// Lines are identical — no highlighting needed
337	return a, b
338	}
339
340	prefix := string(ra[:pfx])
341	suffix := string(ra[len(ra)-sfx:])
342
343	hlA := prefix + ansiBold + ansiRedBg + string(midA) + ansiReset + ansiRed + suffix
344	hlB := prefix + ansiBold + ansiGrnBg + string(midB) + ansiReset + ansiGreen + suffix
345
346	return hlA, hlB
347	}
348

verify.go

Source Files