| 1 | package main |
| 2 | |
| 3 | import ( |
| 4 | "fmt" |
| 5 | "io" |
| 6 | "os" |
| 7 | "regexp" |
| 8 | "strings" |
| 9 | |
| 10 | "github.com/spf13/cobra" |
| 11 | |
| 12 | "go.bigb.es/confluence-md-utilities/converter" |
| 13 | "go.bigb.es/confluence-md-utilities/format" |
| 14 | ) |
| 15 | |
// verifyIndent is the indentation unit passed to format.PrettyXML when the
// verify command pretty-prints both sides of the comparison. It is bound to
// the --indent flag.
var verifyIndent string
| 19 | |
| 20 | var verifyCmd = &cobra.Command{ |
| 21 | Use: "verify [input.xml]", |
| 22 | Short: "Verify round-trip fidelity of XML → Markdown → XML conversion", |
| 23 | Long: `Check that Confluence XML survives a round-trip through Markdown and back. |
| 24 |
|
| 25 | Compares: |
| 26 | A = fmt(input XML) |
| 27 | B = fmt(xml2md(input XML) → md2xml → XML) |
| 28 |
|
| 29 | If A and B match, the round-trip is lossless. Otherwise, prints a diff. |
| 30 |
|
| 31 | Reads from stdin if no file is specified.`, |
| 32 | Args: cobra.MaximumNArgs(1), |
| 33 | RunE: func(cmd *cobra.Command, args []string) error { |
| 34 | var input []byte |
| 35 | var err error |
| 36 | |
| 37 | if len(args) > 0 { |
| 38 | input, err = os.ReadFile(args[0]) |
| 39 | } else { |
| 40 | input, err = io.ReadAll(os.Stdin) |
| 41 | } |
| 42 | if err != nil { |
| 43 | return fmt.Errorf("reading input: %w", err) |
| 44 | } |
| 45 | |
| 46 | xmlInput := string(input) |
| 47 | |
| 48 | // Normalize input: remove elements that cannot survive round-trip |
| 49 | xmlInput = normalizeForVerify(xmlInput) |
| 50 | |
| 51 | // A: format the original XML |
| 52 | formatted := format.PrettyXML(xmlInput, verifyIndent) |
| 53 | |
| 54 | // B: XML → Markdown → XML → format |
| 55 | md, err := converter.ConfluenceToMarkdown(xmlInput) |
| 56 | if err != nil { |
| 57 | return fmt.Errorf("xml→markdown: %w", err) |
| 58 | } |
| 59 | |
| 60 | xmlRoundTrip, err := converter.MarkdownToConfluence([]byte(md)) |
| 61 | if err != nil { |
| 62 | return fmt.Errorf("markdown→xml: %w", err) |
| 63 | } |
| 64 | |
| 65 | formattedRoundTrip := format.PrettyXML(xmlRoundTrip, verifyIndent) |
| 66 | |
| 67 | if formatted == formattedRoundTrip { |
| 68 | fmt.Fprintln(os.Stderr, "OK: round-trip is lossless") |
| 69 | return nil |
| 70 | } |
| 71 | |
| 72 | // Print unified diff with colored inline highlights |
| 73 | linesA := strings.Split(formatted, "\n") |
| 74 | linesB := strings.Split(formattedRoundTrip, "\n") |
| 75 | |
| 76 | fmt.Fprintln(os.Stderr, "MISMATCH: round-trip produced different output") |
| 77 | fmt.Fprintln(os.Stderr, "") |
| 78 | fmt.Fprintf(os.Stderr, "%s--- original (formatted)%s\n", ansiRed, ansiReset) |
| 79 | fmt.Fprintf(os.Stderr, "%s+++ round-trip (formatted)%s\n", ansiGreen, ansiReset) |
| 80 | |
| 81 | ops := computeDiffOps(linesA, linesB) |
| 82 | hunks := buildHunks(ops, 3) |
| 83 | for _, h := range hunks { |
| 84 | printHunk(h) |
| 85 | } |
| 86 | |
| 87 | os.Exit(1) |
| 88 | return nil |
| 89 | }, |
| 90 | } |
| 91 | |
| 92 | func init() { |
| 93 | verifyCmd.Flags().StringVar(&verifyIndent, "indent", " ", "Indentation string (default: 2 spaces)") |
| 94 | rootCmd.AddCommand(verifyCmd) |
| 95 | } |
| 96 | |
// Patterns for XML constructs that normalizeForVerify removes or rewrites.
var (
	// reEmptyParagraph matches a paragraph that holds nothing but a line
	// break, such as <p><br /></p> or <p><br/></p>, with optional whitespace
	// around the <br>.
	reEmptyParagraph = regexp.MustCompile(`<p>\s*<br\s*/?>\s*</p>`)
	// reSpanInCode matches a <span ...>text</span> that follows a <code> tag
	// with only text in between. Group 1 is everything up to the span,
	// group 2 is the text inside the span.
	reSpanInCode = regexp.MustCompile(`(<code>[^<]*)<span[^>]*>([^<]*)</span>`)
	// reAdjacentCode matches a </code> immediately followed by <code>.
	reAdjacentCode = regexp.MustCompile(`</code><code>`)
)

// normalizeForVerify rewrites XML patterns that cannot survive a round-trip
// through Markdown, so that verify compares only what the converter can
// preserve: empty <p><br/></p> paragraphs are dropped, <span> wrappers
// directly inside <code> are unwrapped to their text, and directly adjacent
// <code> elements are merged into one.
func normalizeForVerify(xml string) string {
	cleaned := reEmptyParagraph.ReplaceAllString(xml, "")

	// A single pass unwraps at most one span per <code> tag (the match has to
	// start at the tag), so several spans in a row need several passes.
	// Repeat until a pass leaves the text unchanged.
	for {
		unwrapped := reSpanInCode.ReplaceAllString(cleaned, "${1}${2}")
		if unwrapped == cleaned {
			break
		}
		cleaned = unwrapped
	}

	// Removing the "</code><code>" seam joins the two elements.
	return reAdjacentCode.ReplaceAllString(cleaned, "")
}
| 118 | |
// ANSI SGR escape sequences used to colorize the diff output.
const (
	ansiReset = "\x1b[0m"     // reset all attributes
	ansiRed   = "\x1b[31m"    // red text
	ansiGreen = "\x1b[32m"    // green text
	ansiCyan  = "\x1b[36m"    // cyan text
	ansiBold  = "\x1b[1m"     // bold
	ansiRedBg = "\x1b[41;37m" // red background, white text
	ansiGrnBg = "\x1b[42;30m" // green background, black text
)
| 129 | |
// diffOp classifies one line of a line-level diff between two texts A and B.
type diffOp int

// The possible diffOp values; the zero value is opEqual.
const (
	// opEqual marks a line that is present in both A and B.
	opEqual diffOp = iota
	// opRemove marks a line that is present only in A.
	opRemove
	// opAdd marks a line that is present only in B.
	opAdd
)
| 138 | |
| 139 | // diffLine is a single line in the diff with its operation and source positions. |
| 140 | type diffLine struct { |
| 141 | op diffOp |
| 142 | text string |
| 143 | lineA int // 1-based line number in A (-1 if not applicable) |
| 144 | lineB int // 1-based line number in B (-1 if not applicable) |
| 145 | } |
| 146 | |
| 147 | // hunk is a group of diff lines with surrounding context. |
| 148 | type hunk struct { |
| 149 | startA, countA int // 1-based start and count for A |
| 150 | startB, countB int // 1-based start and count for B |
| 151 | lines []diffLine |
| 152 | } |
| 153 | |
| 154 | // computeDiffOps produces a sequence of diff operations from two line slices |
| 155 | // using LCS-based algorithm. |
| 156 | func computeDiffOps(a, b []string) []diffLine { |
| 157 | m, n := len(a), len(b) |
| 158 | dp := make([][]int, m+1) |
| 159 | for i := range dp { |
| 160 | dp[i] = make([]int, n+1) |
| 161 | } |
| 162 | for i := 1; i <= m; i++ { |
| 163 | for j := 1; j <= n; j++ { |
| 164 | if a[i-1] == b[j-1] { |
| 165 | dp[i][j] = dp[i-1][j-1] + 1 |
| 166 | } else if dp[i-1][j] >= dp[i][j-1] { |
| 167 | dp[i][j] = dp[i-1][j] |
| 168 | } else { |
| 169 | dp[i][j] = dp[i][j-1] |
| 170 | } |
| 171 | } |
| 172 | } |
| 173 | |
| 174 | // Backtrack to produce operations |
| 175 | var ops []diffLine |
| 176 | i, j := m, n |
| 177 | for i > 0 || j > 0 { |
| 178 | if i > 0 && j > 0 && a[i-1] == b[j-1] { |
| 179 | ops = append(ops, diffLine{op: opEqual, text: a[i-1], lineA: i, lineB: j}) |
| 180 | i-- |
| 181 | j-- |
| 182 | } else if j > 0 && (i == 0 || dp[i][j-1] >= dp[i-1][j]) { |
| 183 | ops = append(ops, diffLine{op: opAdd, text: b[j-1], lineA: -1, lineB: j}) |
| 184 | j-- |
| 185 | } else { |
| 186 | ops = append(ops, diffLine{op: opRemove, text: a[i-1], lineA: i, lineB: -1}) |
| 187 | i-- |
| 188 | } |
| 189 | } |
| 190 | // Reverse — we built it backwards |
| 191 | for l, r := 0, len(ops)-1; l < r; l, r = l+1, r-1 { |
| 192 | ops[l], ops[r] = ops[r], ops[l] |
| 193 | } |
| 194 | return ops |
| 195 | } |
| 196 | |
| 197 | // buildHunks groups diff operations into unified-diff hunks with `ctx` context lines. |
| 198 | func buildHunks(ops []diffLine, ctx int) []hunk { |
| 199 | // Find ranges of changed lines, expanded by context |
| 200 | type span struct{ start, end int } // indices into ops |
| 201 | var changed []span |
| 202 | for i, op := range ops { |
| 203 | if op.op != opEqual { |
| 204 | if len(changed) > 0 && i-changed[len(changed)-1].end <= 2*ctx { |
| 205 | // Merge with previous span |
| 206 | changed[len(changed)-1].end = i + 1 |
| 207 | } else { |
| 208 | changed = append(changed, span{i, i + 1}) |
| 209 | } |
| 210 | } |
| 211 | } |
| 212 | |
| 213 | var hunks []hunk |
| 214 | for _, ch := range changed { |
| 215 | lo := ch.start - ctx |
| 216 | if lo < 0 { |
| 217 | lo = 0 |
| 218 | } |
| 219 | hi := ch.end + ctx |
| 220 | if hi > len(ops) { |
| 221 | hi = len(ops) |
| 222 | } |
| 223 | |
| 224 | h := hunk{lines: ops[lo:hi]} |
| 225 | |
| 226 | // Compute start lines and counts |
| 227 | h.startA, h.startB = 1, 1 |
| 228 | if len(h.lines) > 0 { |
| 229 | // Find first valid line numbers |
| 230 | for _, dl := range h.lines { |
| 231 | if dl.lineA > 0 { |
| 232 | h.startA = dl.lineA |
| 233 | break |
| 234 | } |
| 235 | if dl.lineB > 0 { |
| 236 | h.startB = dl.lineB |
| 237 | break |
| 238 | } |
| 239 | } |
| 240 | if h.lines[0].lineA > 0 { |
| 241 | h.startA = h.lines[0].lineA |
| 242 | } |
| 243 | if h.lines[0].lineB > 0 { |
| 244 | h.startB = h.lines[0].lineB |
| 245 | } |
| 246 | } |
| 247 | for _, dl := range h.lines { |
| 248 | if dl.op == opEqual || dl.op == opRemove { |
| 249 | h.countA++ |
| 250 | } |
| 251 | if dl.op == opEqual || dl.op == opAdd { |
| 252 | h.countB++ |
| 253 | } |
| 254 | } |
| 255 | hunks = append(hunks, h) |
| 256 | } |
| 257 | return hunks |
| 258 | } |
| 259 | |
| 260 | // printHunk outputs a single unified diff hunk with ANSI colors and inline highlights. |
| 261 | func printHunk(h hunk) { |
| 262 | // @@ header |
| 263 | fmt.Fprintf(os.Stdout, "%s@@ -%d,%d +%d,%d @@%s\n", |
| 264 | ansiCyan, h.startA, h.countA, h.startB, h.countB, ansiReset) |
| 265 | |
| 266 | lines := h.lines |
| 267 | |
| 268 | for i := 0; i < len(lines); i++ { |
| 269 | dl := lines[i] |
| 270 | switch dl.op { |
| 271 | case opEqual: |
| 272 | fmt.Printf(" %s\n", dl.text) |
| 273 | |
| 274 | case opRemove: |
| 275 | // Try to pair with subsequent add(s) for inline highlighting |
| 276 | remStart := i |
| 277 | for i+1 < len(lines) && lines[i+1].op == opRemove { |
| 278 | i++ |
| 279 | } |
| 280 | remEnd := i + 1 |
| 281 | addStart := remEnd |
| 282 | j := addStart |
| 283 | for j < len(lines) && lines[j].op == opAdd { |
| 284 | j++ |
| 285 | } |
| 286 | addEnd := j |
| 287 | |
| 288 | removed := lines[remStart:remEnd] |
| 289 | added := lines[addStart:addEnd] |
| 290 | |
| 291 | // Pair up removed/added lines for inline diff |
| 292 | pairs := min(len(removed), len(added)) |
| 293 | for p := range pairs { |
| 294 | hl, hr := inlineHighlight(removed[p].text, added[p].text) |
| 295 | fmt.Printf("%s-%s%s\n", ansiRed, hl, ansiReset) |
| 296 | fmt.Printf("%s+%s%s\n", ansiGreen, hr, ansiReset) |
| 297 | } |
| 298 | // Remaining unpaired lines |
| 299 | for p := pairs; p < len(removed); p++ { |
| 300 | fmt.Printf("%s-%s%s\n", ansiRed, removed[p].text, ansiReset) |
| 301 | } |
| 302 | for p := pairs; p < len(added); p++ { |
| 303 | fmt.Printf("%s+%s%s\n", ansiGreen, added[p].text, ansiReset) |
| 304 | } |
| 305 | |
| 306 | i = addEnd - 1 // -1 because loop increments |
| 307 | |
| 308 | case opAdd: |
| 309 | // Unpaired add (not preceded by remove) |
| 310 | fmt.Printf("%s+%s%s\n", ansiGreen, dl.text, ansiReset) |
| 311 | } |
| 312 | } |
| 313 | } |
| 314 | |
| 315 | // inlineHighlight returns two strings (for removed and added lines) with ANSI |
| 316 | // bold marking on the parts that actually differ. |
| 317 | func inlineHighlight(a, b string) (string, string) { |
| 318 | ra := []rune(a) |
| 319 | rb := []rune(b) |
| 320 | |
| 321 | // Common prefix |
| 322 | pfx := 0 |
| 323 | for pfx < len(ra) && pfx < len(rb) && ra[pfx] == rb[pfx] { |
| 324 | pfx++ |
| 325 | } |
| 326 | // Common suffix (from the end, but don't overlap with prefix) |
| 327 | sfx := 0 |
| 328 | for sfx < len(ra)-pfx && sfx < len(rb)-pfx && ra[len(ra)-1-sfx] == rb[len(rb)-1-sfx] { |
| 329 | sfx++ |
| 330 | } |
| 331 | |
| 332 | midA := ra[pfx : len(ra)-sfx] |
| 333 | midB := rb[pfx : len(rb)-sfx] |
| 334 | |
| 335 | if len(midA) == 0 && len(midB) == 0 { |
| 336 | // Lines are identical — no highlighting needed |
| 337 | return a, b |
| 338 | } |
| 339 | |
| 340 | prefix := string(ra[:pfx]) |
| 341 | suffix := string(ra[len(ra)-sfx:]) |
| 342 | |
| 343 | hlA := prefix + ansiBold + ansiRedBg + string(midA) + ansiReset + ansiRed + suffix |
| 344 | hlB := prefix + ansiBold + ansiGrnBg + string(midB) + ansiReset + ansiGreen + suffix |
| 345 | |
| 346 | return hlA, hlB |
| 347 | } |
| 348 | |