xml2md.go

v0.1.0
Doc Versions Source
1
package converter
2
3
import (
4
	"bytes"
5
	"fmt"
6
	htmlpkg "html"
7
	"strings"
8
9
	"golang.org/x/net/html"
10
)
11
12
// ConfluenceToMarkdown converts Confluence storage format XML to Markdown.
13
func ConfluenceToMarkdown(source string) (string, error) {
14
	// Preprocess: extract CDATA content and replace with escaped text,
15
	// because x/net/html doesn't handle CDATA sections.
16
	preprocessed := preprocessCDATA(source)
17
18
	// Wrap in a root element so the HTML parser handles it correctly.
19
	wrapped := "<div>" + preprocessed + "</div>"
20
	doc, err := html.Parse(strings.NewReader(wrapped))
21
	if err != nil {
22
		return "", fmt.Errorf("parsing confluence xml: %w", err)
23
	}
24
25
	var buf bytes.Buffer
26
	c := &xmlConverter{buf: &buf}
27
28
	// Navigate to the wrapper div: html > head > body > div
29
	body := findNode(doc, "body")
30
	if body == nil {
31
		return "", fmt.Errorf("unexpected parse structure")
32
	}
33
	wrapper := body.FirstChild
34
	if wrapper != nil {
35
		c.walkChildren(wrapper, 0)
36
	}
37
38
	result := buf.String()
39
	// Clean up excessive blank lines
40
	for strings.Contains(result, "\n\n\n") {
41
		result = strings.ReplaceAll(result, "\n\n\n", "\n\n")
42
	}
43
	return strings.TrimSpace(result) + "\n", nil
44
}
45
46
// preprocessCDATA rewrites every <![CDATA[...]]> section in s as a
// <cdatacontent> element whose text is the HTML-escaped section content,
// because x/net/html doesn't parse CDATA.
//
// Text outside CDATA sections is copied through unchanged. If an opening
// marker has no matching "]]>", the marker itself is dropped and the rest of
// the input is copied through verbatim.
func preprocessCDATA(s string) string {
	const (
		openMarker  = "<![CDATA["
		closeMarker = "]]>"
	)

	var out strings.Builder
	rest := s
	for {
		before, after, opened := strings.Cut(rest, openMarker)
		out.WriteString(before)
		if !opened {
			break
		}

		content, tail, closed := strings.Cut(after, closeMarker)
		if !closed {
			// Unterminated section: emit what follows the marker as-is.
			out.WriteString(after)
			break
		}

		// Wrap the content in a marker element that can be detected later.
		out.WriteString("<cdatacontent>")
		out.WriteString(htmlpkg.EscapeString(content))
		out.WriteString("</cdatacontent>")
		rest = tail
	}
	return out.String()
}
72
73
// xmlConverter carries the mutable state used while walking the parsed tree
// and writing Markdown.
type xmlConverter struct {
	// buf receives the generated Markdown. It is temporarily swapped for a
	// scratch buffer while a panel body is rendered.
	buf *bytes.Buffer
	// listDepth counts the ul/ol/task-list elements currently being rendered;
	// it drives list-item indentation and the dropping of whitespace-only
	// text nodes.
	listDepth int
	// inListItem is true while the children of an <li> are rendered; it
	// suppresses the blank line normally written after a <p>.
	inListItem bool
}
78
79
// walkChildren renders each direct child of n, in document order, by calling
// walk on it. depth is passed through unchanged.
func (c *xmlConverter) walkChildren(n *html.Node, depth int) {
	child := n.FirstChild
	for child != nil {
		c.walk(child, depth)
		child = child.NextSibling
	}
}
84
85
func (c *xmlConverter) walk(n *html.Node, depth int) {
86
	if n.Type == html.TextNode {
87
		text := n.Data
88
		// Skip whitespace-only text nodes inside lists
89
		if c.listDepth > 0 && strings.TrimSpace(text) == "" {
90
			return
91
		}
92
		// Collapse whitespace in text nodes (XML indentation artifacts)
93
		if strings.TrimSpace(text) != "" {
94
			// Replace sequences of whitespace (including newlines) with single space,
95
			// but preserve the trimmed content
96
			text = collapseWhitespace(text)
97
		}
98
		c.buf.WriteString(text)
99
		return
100
	}
101
102
	if n.Type != html.ElementNode {
103
		c.walkChildren(n, depth)
104
		return
105
	}
106
107
	tag := strings.ToLower(n.Data)
108
109
	switch {
110
	// Headings
111
	case tag == "h1":
112
		c.buf.WriteString("\n# ")
113
		c.walkChildren(n, depth)
114
		c.buf.WriteString("\n\n")
115
	case tag == "h2":
116
		c.buf.WriteString("\n## ")
117
		c.walkChildren(n, depth)
118
		c.buf.WriteString("\n\n")
119
	case tag == "h3":
120
		c.buf.WriteString("\n### ")
121
		c.walkChildren(n, depth)
122
		c.buf.WriteString("\n\n")
123
	case tag == "h4":
124
		c.buf.WriteString("\n#### ")
125
		c.walkChildren(n, depth)
126
		c.buf.WriteString("\n\n")
127
	case tag == "h5":
128
		c.buf.WriteString("\n##### ")
129
		c.walkChildren(n, depth)
130
		c.buf.WriteString("\n\n")
131
	case tag == "h6":
132
		c.buf.WriteString("\n###### ")
133
		c.walkChildren(n, depth)
134
		c.buf.WriteString("\n\n")
135
136
	// Paragraphs
137
	case tag == "p":
138
		c.walkChildren(n, depth)
139
		if !c.inListItem {
140
			c.buf.WriteString("\n\n")
141
		}
142
143
	// Inline formatting
144
	case tag == "strong", tag == "b":
145
		c.buf.WriteString("**")
146
		c.walkChildren(n, depth)
147
		c.buf.WriteString("**")
148
	case tag == "em", tag == "i":
149
		c.buf.WriteString("*")
150
		c.walkChildren(n, depth)
151
		c.buf.WriteString("*")
152
	case tag == "del", tag == "s":
153
		c.buf.WriteString("~~")
154
		c.walkChildren(n, depth)
155
		c.buf.WriteString("~~")
156
	case tag == "code":
157
		if !isPrevSiblingCode(n) {
158
			c.buf.WriteString("`")
159
		}
160
		c.walkChildren(n, depth)
161
		if !isNextSiblingCode(n) {
162
			c.buf.WriteString("`")
163
		}
164
165
	// Links
166
	case tag == "a":
167
		href := getAttr(n, "href")
168
		c.buf.WriteString("[")
169
		c.walkChildren(n, depth)
170
		c.buf.WriteString("](")
171
		c.buf.WriteString(href)
172
		c.buf.WriteString(")")
173
174
	// Line break
175
	case tag == "br":
176
		c.buf.WriteString("  \n")
177
178
	// Horizontal rule
179
	case tag == "hr":
180
		c.buf.WriteString("\n---\n\n")
181
182
	// Lists
183
	case tag == "ul":
184
		c.listDepth++
185
		if c.listDepth == 1 {
186
			c.buf.WriteString("\n")
187
		}
188
		c.walkChildren(n, depth)
189
		c.listDepth--
190
		if c.listDepth == 0 {
191
			c.buf.WriteString("\n")
192
		}
193
	case tag == "ol":
194
		c.listDepth++
195
		if c.listDepth == 1 {
196
			c.buf.WriteString("\n")
197
		}
198
		c.walkOL(n, depth)
199
		c.listDepth--
200
		if c.listDepth == 0 {
201
			c.buf.WriteString("\n")
202
		}
203
	case tag == "li":
204
		prev := c.inListItem
205
		c.inListItem = true
206
		// Check if this list item contains a task checkbox
207
		if hasTaskStatus(n) {
208
			// Task status handler will write the prefix, walkChildrenInline for text
209
			c.walkChildrenInline(n, depth)
210
			c.buf.WriteString("\n")
211
		} else {
212
			indent := strings.Repeat("  ", max(0, c.listDepth-1))
213
			c.buf.WriteString(indent)
214
			c.buf.WriteString("- ")
215
			c.walkChildrenInline(n, depth)
216
			c.buf.WriteString("\n")
217
		}
218
		c.inListItem = prev
219
220
	// Tables - convert to GFM table
221
	case tag == "table":
222
		c.renderTable(n, depth)
223
224
	// Confluence macros - handled via ac:* namespace (parsed as ac-*)
225
	// The HTML parser lowercases and handles colons differently.
226
	// We need to handle both ac:structured-macro and the parsed form.
227
228
	// Skip layout/structural elements, pass through children
229
	case tag == "div", tag == "span", tag == "tbody", tag == "thead",
230
		tag == "colgroup", tag == "col", tag == "content-wrapper":
231
		c.walkChildren(n, depth)
232
233
	// Handle Confluence-specific elements
234
	default:
235
		c.handleConfluenceElement(n, tag, depth)
236
	}
237
}
238
239
func (c *xmlConverter) handleConfluenceElement(n *html.Node, tag string, depth int) {
240
	switch {
241
	// Layout elements — preserve as HTML comments for round-trip
242
	case strings.Contains(tag, "ac:layout-section") || strings.Contains(tag, "layout-section"):
243
		sectionType := getAttr(n, "ac:type")
244
		if sectionType == "" {
245
			sectionType = getAttr(n, "type")
246
		}
247
		fmt.Fprintf(c.buf, "<!-- ac:layout-section type=%q -->\n", sectionType)
248
		c.walkChildren(n, depth)
249
		c.buf.WriteString("<!-- /ac:layout-section -->\n")
250
251
	case strings.Contains(tag, "ac:layout-cell") || strings.Contains(tag, "layout-cell"):
252
		c.buf.WriteString("<!-- ac:layout-cell -->\n")
253
		c.walkChildren(n, depth)
254
		c.buf.WriteString("<!-- /ac:layout-cell -->\n")
255
256
	case tag == "ac:layout" || strings.Contains(tag, "layout") && !strings.Contains(tag, "layout-"):
257
		c.buf.WriteString("<!-- ac:layout -->\n")
258
		c.walkChildren(n, depth)
259
		c.buf.WriteString("<!-- /ac:layout -->\n")
260
	// Confluence structured macros (code blocks, panels, etc.)
261
	case strings.Contains(tag, "structured-macro") || strings.Contains(tag, "ac:structured-macro"):
262
		macroName := getAttr(n, "ac:name")
263
		if macroName == "" {
264
			macroName = getAttr(n, "name")
265
		}
266
		macroID := getAttr(n, "ac:macro-id")
267
		if macroID == "" {
268
			macroID = getAttr(n, "macro-id")
269
		}
270
		switch macroName {
271
		case "code":
272
			c.renderCodeMacro(n, macroID)
273
		case "info":
274
			c.renderPanelAsBlockquote(n, depth)
275
		case "note":
276
			c.renderPanelAsBlockquote(n, depth)
277
		case "warning":
278
			c.renderPanelAsBlockquote(n, depth)
279
		case "toc":
280
			// Preserve TOC macro as HTML comment
281
			if macroID != "" {
282
				fmt.Fprintf(c.buf, "<!-- ac:toc macro-id=%q -->\n", macroID)
283
			} else {
284
				c.buf.WriteString("<!-- ac:toc -->\n")
285
			}
286
		default:
287
			c.walkChildren(n, depth)
288
		}
289
290
	// Confluence images
291
	case strings.Contains(tag, "image") || strings.Contains(tag, "ac:image"):
292
		alt := getAttr(n, "ac:alt")
293
		if alt == "" {
294
			alt = getAttr(n, "alt")
295
		}
296
		imgRef := c.findImageRef(n)
297
		if imgRef.isAttachment {
298
			// Preserve attachment reference as round-trippable HTML
299
			fmt.Fprintf(c.buf, `<span data-attachment="%s"`, imgRef.filename)
300
			if alt != "" {
301
				fmt.Fprintf(c.buf, ` data-alt="%s"`, alt)
302
			}
303
			c.buf.WriteString("/>")
304
		} else {
305
			c.buf.WriteString("![")
306
			c.buf.WriteString(alt)
307
			c.buf.WriteString("](")
308
			c.buf.WriteString(imgRef.url)
309
			c.buf.WriteString(")")
310
		}
311
312
	// Confluence links (user mentions, page links)
313
	case strings.Contains(tag, "ac:link"):
314
		if c.hasUserChild(n) {
315
			c.walkChildren(n, depth)
316
		} else {
317
			c.walkChildren(n, depth)
318
		}
319
320
	// Confluence emoticons
321
	case strings.Contains(tag, "emoticon") || strings.Contains(tag, "ac:emoticon"):
322
		name := getAttr(n, "ac:name")
323
		if name == "" {
324
			name = getAttr(n, "name")
325
		}
326
		switch name {
327
		case "plus":
328
			c.buf.WriteString("(+)")
329
		case "minus":
330
			c.buf.WriteString("(-)")
331
		case "question":
332
			c.buf.WriteString("(?)")
333
		case "tick":
334
			c.buf.WriteString("(v)")
335
		case "cross":
336
			c.buf.WriteString("(x)")
337
		}
338
339
	// Confluence task lists
340
	case strings.Contains(tag, "task-list"):
341
		c.listDepth++
342
		c.walkChildren(n, depth)
343
		c.listDepth--
344
	case strings.Contains(tag, "task-body"):
345
		c.walkChildren(n, depth)
346
		c.buf.WriteString("\n")
347
	case strings.Contains(tag, "task-status"):
348
		status := strings.TrimSpace(getTextContent(n))
349
		indent := strings.Repeat("  ", max(0, c.listDepth-1))
350
		if status == "complete" {
351
			c.buf.WriteString(indent + "- [x] ")
352
		} else {
353
			c.buf.WriteString(indent + "- [ ] ")
354
		}
355
	case strings.Contains(tag, "task-id"):
356
		// Skip task IDs
357
	case strings.Contains(tag, "task") && !strings.Contains(tag, "task-"):
358
		c.walkChildren(n, depth)
359
360
	// Confluence inline comment markers — preserve as span with data attribute
361
	case strings.Contains(tag, "inline-comment-marker"):
362
		ref := getAttr(n, "ac:ref")
363
		if ref == "" {
364
			ref = getAttr(n, "ref")
365
		}
366
		if ref != "" {
367
			fmt.Fprintf(c.buf, `<span data-inline-comment="%s">`, ref)
368
			c.walkChildren(n, depth)
369
			c.buf.WriteString("</span>")
370
		} else {
371
			c.walkChildren(n, depth)
372
		}
373
374
	// User references — preserve as round-trippable HTML span
375
	case strings.Contains(tag, "ri:user"):
376
		userKey := getAttr(n, "ri:userkey")
377
		if userKey == "" {
378
			userKey = getAttr(n, "userkey")
379
		}
380
		if userKey != "" {
381
			fmt.Fprintf(c.buf, `<span data-user-key="%s"/>`, userKey)
382
		}
383
384
	// Time elements
385
	case tag == "time":
386
		datetime := getAttr(n, "datetime")
387
		if datetime != "" {
388
			c.buf.WriteString(datetime)
389
		}
390
391
	// Fallback: just walk children
392
	default:
393
		c.walkChildren(n, depth)
394
	}
395
}
396
397
// renderCodeMacro writes a code macro as a fenced code block.
//
// Every descendant of n is scanned: the text of a parameter element named
// "language" becomes the fence's info string, and the content of a
// plain-text-body element becomes the code. When several elements match, the
// last one in document order wins. If macroID is non-empty, an HTML comment
// recording it (and the macro's attribute order, when available) precedes the
// fence.
func (c *xmlConverter) renderCodeMacro(n *html.Node, macroID string) {
	var language, code string

	var scan func(*html.Node)
	scan = func(node *html.Node) {
		if node.Type == html.ElementNode {
			name := strings.ToLower(node.Data)
			if strings.Contains(name, "parameter") {
				param := getAttr(node, "ac:name")
				if param == "" {
					param = getAttr(node, "name")
				}
				if param == "language" {
					language = getTextContent(node)
				}
			}
			if strings.Contains(name, "plain-text-body") {
				code = getCDATAContent(node)
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			scan(child)
		}
	}
	scan(n)

	// The original attribute order is recorded alongside the macro ID.
	order := extractAttrOrder(n)
	switch {
	case macroID == "":
		c.buf.WriteString("\n")
	case order != "":
		fmt.Fprintf(c.buf, "\n<!-- ac:code macro-id=%q attr-order=%q -->\n", macroID, order)
	default:
		fmt.Fprintf(c.buf, "\n<!-- ac:code macro-id=%q -->\n", macroID)
	}

	c.buf.WriteString("```" + language + "\n" + code)
	// The closing fence must start on its own line.
	if !strings.HasSuffix(code, "\n") {
		c.buf.WriteString("\n")
	}
	c.buf.WriteString("```\n\n")
}
446
447
// renderPanelAsBlockquote writes the rich-text-body content found under n as
// a Markdown blockquote.
//
// The body is first rendered into a scratch buffer by temporarily replacing
// c.buf. The trimmed result is then split on newlines and every line is
// written with a "> " prefix, followed by one empty line.
func (c *xmlConverter) renderPanelAsBlockquote(n *html.Node, depth int) {
	var body bytes.Buffer
	out := c.buf
	c.buf = &body

	// Render the children of each rich-text-body element; the search does not
	// descend into a body once it has been found.
	var visit func(*html.Node)
	visit = func(node *html.Node) {
		if node.Type == html.ElementNode &&
			strings.Contains(strings.ToLower(node.Data), "rich-text-body") {
			c.walkChildren(node, depth)
			return
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			visit(child)
		}
	}
	visit(n)

	// Restore the real output buffer before writing the quote.
	c.buf = out

	for _, line := range strings.Split(strings.TrimSpace(body.String()), "\n") {
		out.WriteString("> " + line + "\n")
	}
	out.WriteString("\n")
}
479
480
// renderTable writes the table rooted at n as a GFM table.
//
// Nothing is written when the table has no rows or no cells. The column count
// is that of the widest row. Table attributes reported by extractTableAttrs
// are kept in a leading HTML comment. When the first row is not a header row,
// an empty header row is written so that the separator line has a row above
// it.
func (c *xmlConverter) renderTable(n *html.Node, depth int) {
	rows := collectTableRows(n)

	// Widest row decides the column count; zero also covers "no rows".
	width := 0
	for i := range rows {
		width = max(width, len(rows[i].cells))
	}
	if width == 0 {
		return
	}

	if attrs := extractTableAttrs(n); attrs != "" {
		fmt.Fprintf(c.buf, "\n<!-- table-attrs: %s -->\n", attrs)
	} else {
		c.buf.WriteString("\n")
	}

	body := rows
	if rows[0].isHeader {
		c.writeTableRow(rows[0].cells, width)
		body = rows[1:]
	} else {
		// No header in the source: emit an empty one.
		c.writeTableRow(make([]string, width), width)
	}
	c.writeTableSep(width)

	for i := range body {
		c.writeTableRow(body[i].cells, width)
	}
	c.buf.WriteString("\n")
}
525
526
// writeTableRow writes one GFM table row with exactly cols columns.
//
// Missing cells (when len(cells) < cols) are written empty and cells beyond
// cols are ignored. A literal '|' inside a cell is written as `\|`, because
// an unescaped pipe would be read as a column delimiter and split the cell.
func (c *xmlConverter) writeTableRow(cells []string, cols int) {
	c.buf.WriteString("|")
	for i := range cols {
		cell := ""
		if i < len(cells) {
			cell = strings.ReplaceAll(cells[i], "|", `\|`)
		}
		c.buf.WriteString(" ")
		c.buf.WriteString(cell)
		c.buf.WriteString(" |")
	}
	c.buf.WriteString("\n")
}
539
540
// writeTableSep writes the header separator line of a GFM table: a leading
// "|" followed by one "---|" per column and a newline.
func (c *xmlConverter) writeTableSep(cols int) {
	// A non-positive column count yields just the leading pipe.
	c.buf.WriteString("|" + strings.Repeat("---|", max(cols, 0)) + "\n")
}
547
548
// walkOL renders the <li> children of the ordered list n as numbered items
// ("1. ", "2. ", ...), indented by two spaces per enclosing list level.
// Children of n that are not <li> elements are skipped.
//
// While an item is rendered, c.inListItem is set, exactly as the <li> case in
// walk does for unordered lists. Without it, a <p> reached through a wrapper
// element inside a numbered item would emit a paragraph break in the middle
// of the item. The previous value is restored after each item.
func (c *xmlConverter) walkOL(n *html.Node, depth int) {
	// Nested lists restore listDepth before returning, so the indent is the
	// same for every item of this list.
	indent := strings.Repeat("  ", max(0, c.listDepth-1))

	number := 1
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if child.Type != html.ElementNode || strings.ToLower(child.Data) != "li" {
			continue
		}

		c.buf.WriteString(indent)
		fmt.Fprintf(c.buf, "%d. ", number)

		outer := c.inListItem
		c.inListItem = true
		c.walkChildrenInline(child, depth)
		c.inListItem = outer

		c.buf.WriteString("\n")
		number++
	}
}
565
566
func (c *xmlConverter) walkChildrenInline(n *html.Node, depth int) {
567
	for child := n.FirstChild; child != nil; child = child.NextSibling {
568
		if child.Type == html.TextNode {
569
			// Collapse whitespace but preserve a single space between inline elements
570
			text := collapseWhitespace(child.Data)
571
			// Only trim leading space if this is the very first child
572
			if child == n.FirstChild {
573
				text = strings.TrimLeft(text, " ")
574
			}
575
			// Only trim trailing space if this is the very last child
576
			if child.NextSibling == nil {
577
				text = strings.TrimRight(text, " ")
578
			}
579
			if text != "" {
580
				c.buf.WriteString(text)
581
			}
582
			continue
583
		}
584
		if child.Type == html.ElementNode {
585
			tag := strings.ToLower(child.Data)
586
			switch {
587
			case tag == "p":
588
				c.walkChildrenInline(child, depth)
589
			case tag == "ul", tag == "ol":
590
				c.buf.WriteString("\n")
591
				c.walk(child, depth)
592
			default:
593
				c.walk(child, depth)
594
			}
595
		}
596
	}
597
}
598
599
// extractTableAttrs extracts class, style, and colgroup info as a JSON-like string for preservation.
600
func extractTableAttrs(table *html.Node) string {
601
	var parts []string
602
603
	// Table class and style
604
	cls := getAttr(table, "class")
605
	style := getAttr(table, "style")
606
	if cls != "" {
607
		parts = append(parts, fmt.Sprintf("class=%q", cls))
608
	}
609
	if style != "" {
610
		parts = append(parts, fmt.Sprintf("style=%q", style))
611
	}
612
613
	// Colgroup
614
	var colWidths []string
615
	for child := table.FirstChild; child != nil; child = child.NextSibling {
616
		if child.Type == html.ElementNode && strings.ToLower(child.Data) == "colgroup" {
617
			for col := child.FirstChild; col != nil; col = col.NextSibling {
618
				if col.Type == html.ElementNode && strings.ToLower(col.Data) == "col" {
619
					colStyle := getAttr(col, "style")
620
					if colStyle != "" {
621
						colWidths = append(colWidths, colStyle)
622
					}
623
				}
624
			}
625
		}
626
	}
627
	if len(colWidths) > 0 {
628
		parts = append(parts, fmt.Sprintf("cols=[%s]", strings.Join(colWidths, "|")))
629
	}
630
631
	return strings.Join(parts, " ")
632
}
633
634
// tableRow is one collected table row with its cells already rendered.
type tableRow struct {
	// isHeader is set when the row was found under <thead> or contains at
	// least one <th> cell.
	isHeader bool
	// cells holds the trimmed inline Markdown of each <th>/<td>, in order.
	cells []string
}
638
639
func collectTableRows(table *html.Node) []tableRow {
640
	var rows []tableRow
641
	var walk func(*html.Node, bool)
642
	walk = func(n *html.Node, inHeader bool) {
643
		if n.Type == html.ElementNode {
644
			tag := strings.ToLower(n.Data)
645
			switch tag {
646
			case "thead":
647
				for child := n.FirstChild; child != nil; child = child.NextSibling {
648
					walk(child, true)
649
				}
650
				return
651
			case "tbody":
652
				for child := n.FirstChild; child != nil; child = child.NextSibling {
653
					walk(child, false)
654
				}
655
				return
656
			case "tr":
657
				row := tableRow{isHeader: inHeader}
658
				for child := n.FirstChild; child != nil; child = child.NextSibling {
659
					if child.Type == html.ElementNode {
660
						cellTag := strings.ToLower(child.Data)
661
						if cellTag == "th" {
662
							row.isHeader = true
663
							row.cells = append(row.cells, strings.TrimSpace(renderCellMarkdown(child)))
664
						} else if cellTag == "td" {
665
							row.cells = append(row.cells, strings.TrimSpace(renderCellMarkdown(child)))
666
						}
667
					}
668
				}
669
				rows = append(rows, row)
670
				return
671
			}
672
		}
673
		for child := n.FirstChild; child != nil; child = child.NextSibling {
674
			walk(child, inHeader)
675
		}
676
	}
677
	walk(table, false)
678
	return rows
679
}
680
681
// renderCellMarkdown renders cell content to inline markdown, preserving
682
// formatting like bold, italic, code, links, br, and user references.
683
func renderCellMarkdown(cell *html.Node) string {
684
	var buf bytes.Buffer
685
	renderCellNode(&buf, cell)
686
	return buf.String()
687
}
688
689
func renderCellNode(buf *bytes.Buffer, n *html.Node) {
690
	for child := n.FirstChild; child != nil; child = child.NextSibling {
691
		switch child.Type {
692
		case html.TextNode:
693
			text := collapseWhitespace(child.Data)
694
			buf.WriteString(text)
695
		case html.ElementNode:
696
			tag := strings.ToLower(child.Data)
697
			switch {
698
			case tag == "strong" || tag == "b":
699
				buf.WriteString("**")
700
				renderCellNode(buf, child)
701
				buf.WriteString("**")
702
			case tag == "em" || tag == "i":
703
				buf.WriteString("*")
704
				renderCellNode(buf, child)
705
				buf.WriteString("*")
706
			case tag == "del" || tag == "s":
707
				buf.WriteString("~~")
708
				renderCellNode(buf, child)
709
				buf.WriteString("~~")
710
			case tag == "code":
711
				buf.WriteString("`")
712
				buf.WriteString(getTextContent(child))
713
				buf.WriteString("`")
714
			case tag == "a":
715
				href := getAttr(child, "href")
716
				buf.WriteString("[")
717
				renderCellNode(buf, child)
718
				buf.WriteString("](")
719
				buf.WriteString(href)
720
				buf.WriteString(")")
721
			case tag == "br":
722
				buf.WriteString("<br/>")
723
			case tag == "p":
724
				// Unwrap <p> inside cells
725
				renderCellNode(buf, child)
726
			case tag == "div":
727
				renderCellNode(buf, child)
728
			case strings.Contains(tag, "user"):
729
				userKey := getAttr(child, "ri:userkey")
730
				if userKey == "" {
731
					userKey = getAttr(child, "userkey")
732
				}
733
				if userKey != "" {
734
					fmt.Fprintf(buf, `<span data-user-key="%s"/>`, userKey)
735
				}
736
			case strings.Contains(tag, "ac:link"):
737
				renderCellNode(buf, child)
738
			case strings.Contains(tag, "image"):
739
				// Handle images in cells
740
				alt := getAttr(child, "ac:alt")
741
				if alt == "" {
742
					alt = getAttr(child, "alt")
743
				}
744
				var imgBuf bytes.Buffer
745
				c := &xmlConverter{buf: &imgBuf}
746
				ref := c.findImageRef(child)
747
				if ref.isAttachment {
748
					fmt.Fprintf(buf, `<span data-attachment="%s"`, ref.filename)
749
					if alt != "" {
750
						fmt.Fprintf(buf, ` data-alt="%s"`, alt)
751
					}
752
					buf.WriteString("/>")
753
				} else if ref.url != "" {
754
					buf.WriteString("![")
755
					buf.WriteString(alt)
756
					buf.WriteString("](")
757
					buf.WriteString(ref.url)
758
					buf.WriteString(")")
759
				}
760
			case strings.Contains(tag, "task-list"):
761
				renderCellTaskList(buf, child)
762
			case strings.Contains(tag, "emoticon"):
763
				name := getAttr(child, "ac:name")
764
				if name == "" {
765
					name = getAttr(child, "name")
766
				}
767
				switch name {
768
				case "plus":
769
					buf.WriteString("(+)")
770
				case "minus":
771
					buf.WriteString("(-)")
772
				case "question":
773
					buf.WriteString("(?)")
774
				case "tick":
775
					buf.WriteString("(v)")
776
				case "cross":
777
					buf.WriteString("(x)")
778
				}
779
			case strings.Contains(tag, "inline-comment-marker"):
780
				ref := getAttr(child, "ac:ref")
781
				if ref == "" {
782
					ref = getAttr(child, "ref")
783
				}
784
				if ref != "" {
785
					fmt.Fprintf(buf, `<span data-inline-comment="%s">`, ref)
786
					renderCellNode(buf, child)
787
					buf.WriteString("</span>")
788
				} else {
789
					renderCellNode(buf, child)
790
				}
791
			default:
792
				renderCellNode(buf, child)
793
			}
794
		}
795
	}
796
}
797
798
// renderCellTaskList renders a task list inside a table cell as inline markdown.
799
func renderCellTaskList(buf *bytes.Buffer, n *html.Node) {
800
	for child := n.FirstChild; child != nil; child = child.NextSibling {
801
		if child.Type != html.ElementNode {
802
			continue
803
		}
804
		tag := strings.ToLower(child.Data)
805
		if !strings.Contains(tag, "task") || strings.Contains(tag, "task-list") {
806
			continue
807
		}
808
		// This is an ac:task element
809
		status := ""
810
		var bodyContent string
811
		for tc := child.FirstChild; tc != nil; tc = tc.NextSibling {
812
			if tc.Type != html.ElementNode {
813
				continue
814
			}
815
			tcTag := strings.ToLower(tc.Data)
816
			if strings.Contains(tcTag, "task-status") {
817
				status = strings.TrimSpace(getTextContent(tc))
818
			} else if strings.Contains(tcTag, "task-body") {
819
				bodyContent = strings.TrimSpace(renderCellMarkdown(tc))
820
			}
821
		}
822
		check := "[ ]"
823
		if status == "complete" {
824
			check = "[x]"
825
		}
826
		fmt.Fprintf(buf, "- %s %s<br/>", check, bodyContent)
827
	}
828
}
829
830
// imageRef describes where an image's content comes from, as found by
// findImageRef.
type imageRef struct {
	// url is the value taken from a url element, if one was found.
	url string
	// filename is the file name taken from an attachment element, if any.
	filename string
	// isAttachment is set together with filename.
	isAttachment bool
}
835
836
// findImageRef searches n and its descendants for the image source.
//
// An element whose tag contains "url" and has a non-empty ri:value (or value)
// attribute sets the URL. An element whose tag contains "attachment" and has a
// non-empty ri:filename (or filename) attribute sets the file name and marks
// the reference as an attachment. The search does not descend below a
// matching element but continues with its siblings, so a later match
// overwrites an earlier one. The zero imageRef is returned when nothing
// matches.
func (c *xmlConverter) findImageRef(n *html.Node) imageRef {
	// firstAttr returns the prefixed attribute, or the bare one when empty.
	firstAttr := func(node *html.Node, prefixed, bare string) string {
		if v := getAttr(node, prefixed); v != "" {
			return v
		}
		return getAttr(node, bare)
	}

	var found imageRef
	var search func(*html.Node)
	search = func(node *html.Node) {
		if node.Type == html.ElementNode {
			name := strings.ToLower(node.Data)
			// e.g. <ri:url ri:value="..."/>
			if strings.Contains(name, "url") {
				if v := firstAttr(node, "ri:value", "value"); v != "" {
					found.url = v
					return
				}
			}
			// e.g. <ri:attachment ri:filename="..."/>
			if strings.Contains(name, "attachment") {
				if f := firstAttr(node, "ri:filename", "filename"); f != "" {
					found.filename = f
					found.isAttachment = true
					return
				}
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			search(child)
		}
	}
	search(n)

	return found
}
873
874
// hasUserChild reports whether n has a direct element child whose lower-cased
// tag name contains "user".
func (c *xmlConverter) hasUserChild(n *html.Node) bool {
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if child.Type != html.ElementNode {
			continue
		}
		if strings.Contains(strings.ToLower(child.Data), "user") {
			return true
		}
	}
	return false
}
885
886
// Helper functions
887
888
// findNode returns the first element named tag in the subtree rooted at n,
// searching depth-first in document order and including n itself. The name is
// compared exactly (no case folding). It returns nil when there is no match.
func findNode(n *html.Node, tag string) *html.Node {
	if n.Type == html.ElementNode && n.Data == tag {
		return n
	}
	child := n.FirstChild
	for child != nil {
		if hit := findNode(child, tag); hit != nil {
			return hit
		}
		child = child.NextSibling
	}
	return nil
}
899
900
// getAttr returns the value of the first attribute of n whose name equals key,
// or "" when there is none. An attribute that carries a namespace is matched
// under the name "namespace:key".
func getAttr(n *html.Node, key string) string {
	for i := range n.Attr {
		a := &n.Attr[i]
		name := a.Key
		if a.Namespace != "" {
			name = a.Namespace + ":" + name
		}
		if name == key {
			return a.Val
		}
	}
	return ""
}
912
913
// collapseWhitespace replaces every run of spaces, tabs, newlines and carriage
// returns in s with a single space. Leading and trailing runs are reduced to
// one space as well rather than removed. All other characters, including
// other Unicode spaces, are copied unchanged.
func collapseWhitespace(s string) string {
	var out strings.Builder
	prevSpace := false
	for _, r := range s {
		switch r {
		case ' ', '\t', '\n', '\r':
			// Only the first character of a run produces output.
			if !prevSpace {
				out.WriteByte(' ')
			}
			prevSpace = true
		default:
			out.WriteRune(r)
			prevSpace = false
		}
	}
	return out.String()
}
931
932
// hasTaskStatus checks if a node contains a task-status element.
933
func hasTaskStatus(n *html.Node) bool {
934
	for child := n.FirstChild; child != nil; child = child.NextSibling {
935
		if child.Type == html.ElementNode {
936
			tag := strings.ToLower(child.Data)
937
			if strings.Contains(tag, "task-status") {
938
				return true
939
			}
940
		}
941
	}
942
	return false
943
}
944
945
// getCDATAContent retrieves content from preprocessed CDATA sections.
946
// It looks for <cdatacontent> elements and unescapes their text.
947
func getCDATAContent(n *html.Node) string {
948
	var buf bytes.Buffer
949
	var walk func(*html.Node)
950
	walk = func(node *html.Node) {
951
		if node.Type == html.ElementNode && node.Data == "cdatacontent" {
952
			text := getTextContent(node)
953
			buf.WriteString(htmlpkg.UnescapeString(text))
954
			return
955
		}
956
		if node.Type == html.TextNode {
957
			buf.WriteString(node.Data)
958
		}
959
		for child := node.FirstChild; child != nil; child = child.NextSibling {
960
			walk(child)
961
		}
962
	}
963
	walk(n)
964
	return buf.String()
965
}
966
967
// extractAttrOrder returns a comma-separated list of short attribute names
968
// (e.g. "name,schema-version,macro-id") preserving the original order from the HTML node.
969
// The "ac:" prefix is stripped for brevity.
970
func extractAttrOrder(n *html.Node) string {
971
	var names []string
972
	for _, attr := range n.Attr {
973
		key := attr.Key
974
		if attr.Namespace != "" {
975
			key = attr.Namespace + ":" + attr.Key
976
		}
977
		short := strings.TrimPrefix(key, "ac:")
978
		names = append(names, short)
979
	}
980
	return strings.Join(names, ",")
981
}
982
983
// isNextSiblingCode checks if the next non-whitespace sibling is a <code> element.
984
func isNextSiblingCode(n *html.Node) bool {
985
	for s := n.NextSibling; s != nil; s = s.NextSibling {
986
		if s.Type == html.TextNode && strings.TrimSpace(s.Data) == "" {
987
			continue
988
		}
989
		return s.Type == html.ElementNode && strings.ToLower(s.Data) == "code"
990
	}
991
	return false
992
}
993
994
// isPrevSiblingCode checks if the previous non-whitespace sibling is a <code> element.
995
func isPrevSiblingCode(n *html.Node) bool {
996
	for s := n.PrevSibling; s != nil; s = s.PrevSibling {
997
		if s.Type == html.TextNode && strings.TrimSpace(s.Data) == "" {
998
			continue
999
		}
1000
		return s.Type == html.ElementNode && strings.ToLower(s.Data) == "code"
1001
	}
1002
	return false
1003
}
1004
1005
// getTextContent returns the concatenation, in document order, of the data of
// every text node in the subtree rooted at n (n included). No whitespace is
// trimmed or collapsed.
func getTextContent(n *html.Node) string {
	var text strings.Builder
	var collect func(*html.Node)
	collect = func(node *html.Node) {
		if node.Type == html.TextNode {
			text.WriteString(node.Data)
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			collect(child)
		}
	}
	collect(n)
	return text.String()
}
1019

Source Files