// Source file: pretty.go (v0.1.0)
1
package format
2
3
import (
4
	"strings"
5
	"unicode/utf8"
6
)
7
8
// defaultMaxLineWidth is the width limit, counted in runes, used by the
// formatter: output lines wider than this are wrapped, and an inlineable
// block is only kept on one line when it does not exceed it.
const defaultMaxLineWidth = 120
9
10
// blockTags is the set of block-level elements. A block element is written
// on its own line and its children are indented one level deeper.
//
// Every tag listed in inlineableBlocks must also appear here: the formatter
// only attempts to inline an element after it has been recognised as a block,
// so an inlineable tag missing from this set would silently be treated as
// plain inline content.
var blockTags = map[string]bool{
	// Layout
	"ac:layout":         true,
	"ac:layout-section": true,
	"ac:layout-cell":    true,
	// Block content
	"p":  true,
	"h1": true, "h2": true, "h3": true, "h4": true, "h5": true, "h6": true,
	"div": true,
	// Lists
	"ul": true, "ol": true, "li": true,
	// Tables
	"table": true, "thead": true, "tbody": true, "colgroup": true,
	"tr": true, "th": true, "td": true,
	// Macros
	"ac:structured-macro": true,
	"ac:rich-text-body":   true,
	"ac:plain-text-body":  true,
	// Task lists
	"ac:task-list":   true,
	"ac:task":        true,
	"ac:task-id":     true,
	"ac:task-status": true,
	"ac:task-body":   true,
}
34
35
// inlineableBlocks is the subset of block elements that are kept on a single
// line (open tag, content, close tag) when the whole element is short enough.
var inlineableBlocks = map[string]bool{
	// List items and table cells
	"li": true,
	"th": true,
	"td": true,
	// Headings
	"h1": true,
	"h2": true,
	"h3": true,
	"h4": true,
	"h5": true,
	"h6": true,
	// Task metadata
	"ac:task-id":     true,
	"ac:task-status": true,
}
41
42
// preTags is the set of preformatted elements: everything between the open
// and close tag of such an element is copied through without reformatting.
var preTags = map[string]bool{"ac:plain-text-body": true}
46
47
// PrettyXML formats Confluence storage XML with sensible indentation.
48
func PrettyXML(input string, indent string) string {
49
	tokens := tokenize(input)
50
	var buf strings.Builder
51
	level := 0
52
	inPre := 0
53
	atLineStart := true
54
55
	i := 0
56
	for i < len(tokens) {
57
		tok := tokens[i]
58
59
		switch tok.kind {
60
		case tokenOpen:
61
			tagName := tok.tagName()
62
			if inPre > 0 {
63
				buf.WriteString(tok.raw)
64
				if preTags[tagName] {
65
					inPre++
66
				}
67
				i++
68
				continue
69
			}
70
			if preTags[tagName] {
71
				inPre++
72
				ensureIndentedLine(&buf, level, indent, &atLineStart)
73
				buf.WriteString(tok.raw)
74
				i++
75
				continue
76
			}
77
			if blockTags[tagName] {
78
				// Try to inline short blocks like <li>text</li>, <h1>Title</h1>
79
				if inlineableBlocks[tagName] {
80
					if inlined, skip := tryInlineBlock(tokens[i:], tagName); skip > 0 {
81
						ensureIndentedLine(&buf, level, indent, &atLineStart)
82
						buf.WriteString(inlined)
83
						buf.WriteString("\n")
84
						atLineStart = true
85
						i += skip
86
						continue
87
					}
88
				}
89
				ensureIndentedLine(&buf, level, indent, &atLineStart)
90
				buf.WriteString(tok.raw)
91
				buf.WriteString("\n")
92
				level++
93
				atLineStart = true
94
			} else {
95
				if atLineStart {
96
					writeIndentPrefix(&buf, level, indent)
97
					atLineStart = false
98
				}
99
				buf.WriteString(tok.raw)
100
			}
101
102
		case tokenClose:
103
			tagName := tok.tagName()
104
			if inPre > 0 {
105
				buf.WriteString(tok.raw)
106
				if preTags[tagName] {
107
					inPre--
108
				}
109
				i++
110
				continue
111
			}
112
			if blockTags[tagName] {
113
				level--
114
				if level < 0 {
115
					level = 0
116
				}
117
				if !atLineStart {
118
					buf.WriteString("\n")
119
				}
120
				writeIndentPrefix(&buf, level, indent)
121
				buf.WriteString(tok.raw)
122
				buf.WriteString("\n")
123
				atLineStart = true
124
			} else {
125
				buf.WriteString(tok.raw)
126
			}
127
128
		case tokenSelfClose:
129
			tagName := tok.tagName()
130
			if inPre > 0 {
131
				buf.WriteString(tok.raw)
132
				i++
133
				continue
134
			}
135
			if blockTags[tagName] || tagName == "hr" || tagName == "col" {
136
				ensureIndentedLine(&buf, level, indent, &atLineStart)
137
				buf.WriteString(tok.raw)
138
				buf.WriteString("\n")
139
				atLineStart = true
140
			} else {
141
				if atLineStart {
142
					writeIndentPrefix(&buf, level, indent)
143
					atLineStart = false
144
				}
145
				buf.WriteString(tok.raw)
146
			}
147
148
		case tokenText:
149
			if inPre > 0 {
150
				buf.WriteString(tok.raw)
151
				i++
152
				continue
153
			}
154
			text := collapseWS(tok.raw)
155
			if text == "" || text == " " {
156
				i++
157
				continue
158
			}
159
			if atLineStart {
160
				text = strings.TrimLeft(text, " ")
161
				if text == "" {
162
					i++
163
					continue
164
				}
165
				writeIndentPrefix(&buf, level, indent)
166
				atLineStart = false
167
			}
168
			buf.WriteString(text)
169
170
		case tokenCDATA, tokenComment:
171
			if inPre > 0 {
172
				buf.WriteString(tok.raw)
173
				i++
174
				continue
175
			}
176
			if atLineStart {
177
				writeIndentPrefix(&buf, level, indent)
178
				atLineStart = false
179
			}
180
			buf.WriteString(tok.raw)
181
		}
182
183
		i++
184
	}
185
186
	result := buf.String()
187
	// Post-process: clean up lines and wrap long ones
188
	lines := strings.Split(result, "\n")
189
	var final []string
190
	for _, line := range lines {
191
		line = strings.TrimRight(line, " \t")
192
		if runeWidth(line) > defaultMaxLineWidth {
193
			final = append(final, wrapLine(line, defaultMaxLineWidth)...)
194
		} else {
195
			final = append(final, line)
196
		}
197
	}
198
	return strings.TrimSpace(strings.Join(final, "\n")) + "\n"
199
}
200
201
// tryInlineBlock checks if the block starting at tokens[0] (an open tag) has
202
// only inline/text children and a matching close tag, and the total is short
203
// enough to fit on one line. Returns the inlined string and number of tokens consumed.
204
func tryInlineBlock(tokens []token, tagName string) (string, int) {
205
	if len(tokens) < 2 {
206
		return "", 0
207
	}
208
	// Scan forward to find matching close tag
209
	depth := 0
210
	var inner strings.Builder
211
	for j, tok := range tokens {
212
		if j == 0 {
213
			inner.WriteString(tok.raw)
214
			depth = 1
215
			continue
216
		}
217
		switch tok.kind {
218
		case tokenOpen:
219
			tn := tok.tagName()
220
			if blockTags[tn] && !inlineableBlocks[tn] {
221
				// Contains a non-inlineable block child — can't inline
222
				return "", 0
223
			}
224
			if tn == tagName {
225
				depth++
226
			}
227
			inner.WriteString(tok.raw)
228
		case tokenClose:
229
			tn := tok.tagName()
230
			if tn == tagName {
231
				depth--
232
				if depth == 0 {
233
					inner.WriteString(tok.raw)
234
					result := inner.String()
235
					if runeWidth(result) <= defaultMaxLineWidth {
236
						return result, j + 1
237
					}
238
					return "", 0
239
				}
240
			}
241
			inner.WriteString(tok.raw)
242
		case tokenText:
243
			text := collapseWS(tok.raw)
244
			if text == "" {
245
				continue
246
			}
247
			// Trim leading space only for the first text token after open tag
248
			if j == 1 {
249
				text = strings.TrimLeft(text, " ")
250
			}
251
			inner.WriteString(text)
252
		case tokenCDATA:
253
			// CDATA in an inlineable block — don't inline if multiline
254
			if strings.Contains(tok.raw, "\n") {
255
				return "", 0
256
			}
257
			inner.WriteString(tok.raw)
258
		default:
259
			inner.WriteString(tok.raw)
260
		}
261
	}
262
	return "", 0
263
}
264
265
// wrapLine splits a long line at word boundaries, preserving leading indentation.
266
// It is XML-aware: it won't break inside tags (< ... >).
267
func wrapLine(line string, maxWidth int) []string {
268
	// Extract leading indentation
269
	trimmed := strings.TrimLeft(line, " \t")
270
	indentStr := line[:len(line)-len(trimmed)]
271
	contIndent := indentStr + "  " // continuation lines get extra indent
272
273
	// Split into segments: tags (unsplittable) and text (splittable at spaces)
274
	segments := splitSegments(trimmed)
275
276
	var lines []string
277
	var cur strings.Builder
278
	cur.WriteString(indentStr)
279
	curWidth := runeWidth(indentStr)
280
281
	for _, seg := range segments {
282
		segW := runeWidth(seg)
283
284
		if seg == "" {
285
			continue
286
		}
287
288
		// Tags and non-space text: never break inside
289
		if strings.HasPrefix(seg, "<") {
290
			// If adding this tag exceeds limit and we have content, wrap
291
			if curWidth+segW > maxWidth && curWidth > runeWidth(indentStr) {
292
				lines = append(lines, strings.TrimRight(cur.String(), " "))
293
				cur.Reset()
294
				cur.WriteString(contIndent)
295
				curWidth = runeWidth(contIndent)
296
			}
297
			cur.WriteString(seg)
298
			curWidth += segW
299
			continue
300
		}
301
302
		// Text segment: split at word boundaries
303
		words := strings.Fields(seg)
304
		// Preserve leading space if original had one
305
		needSpace := len(seg) > 0 && seg[0] == ' '
306
307
		for _, word := range words {
308
			wordW := runeWidth(word)
309
			spaceW := 0
310
			if needSpace {
311
				spaceW = 1
312
			}
313
314
			if curWidth+spaceW+wordW > maxWidth && curWidth > runeWidth(contIndent) {
315
				lines = append(lines, strings.TrimRight(cur.String(), " "))
316
				cur.Reset()
317
				cur.WriteString(contIndent)
318
				curWidth = runeWidth(contIndent)
319
				needSpace = false
320
			}
321
322
			if needSpace {
323
				cur.WriteByte(' ')
324
				curWidth++
325
			}
326
			cur.WriteString(word)
327
			curWidth += wordW
328
			needSpace = true
329
		}
330
	}
331
332
	if cur.Len() > 0 {
333
		final := strings.TrimRight(cur.String(), " ")
334
		if final != "" {
335
			lines = append(lines, final)
336
		}
337
	}
338
339
	if len(lines) == 0 {
340
		return []string{line}
341
	}
342
	return lines
343
}
344
345
// splitSegments breaks text into alternating tag and text segments.
// E.g. "Hello <strong>world</strong> end" -> ["Hello ", "<strong>", "world", "</strong>", " end"]
//
// A tag segment runs from '<' to the next '>'. CDATA sections and comments
// are kept whole: they run to their own terminator ("]]>" and "-->"), so a
// '>' inside them does not end the segment. Markup that is not terminated
// within s becomes one final segment reaching to the end of s.
func splitSegments(s string) []string {
	var segs []string
	for len(s) > 0 {
		lt := strings.IndexByte(s, '<')
		if lt == -1 {
			return append(segs, s)
		}
		if lt > 0 {
			segs = append(segs, s[:lt])
		}
		s = s[lt:]

		terminator := ">"
		switch {
		case strings.HasPrefix(s, "<![CDATA["):
			terminator = "]]>"
		case strings.HasPrefix(s, "<!--"):
			terminator = "-->"
		}

		end := strings.Index(s, terminator)
		if end == -1 {
			return append(segs, s)
		}
		end += len(terminator)
		segs = append(segs, s[:end])
		s = s[end:]
	}
	return segs
}
368
369
// runeWidth returns the width of s measured as its number of runes. Every
// rune counts as one column, whatever its display width.
func runeWidth(s string) (width int) {
	width = utf8.RuneCountInString(s)
	return width
}
372
373
func ensureIndentedLine(buf *strings.Builder, level int, indent string, atLineStart *bool) {
374
	if !*atLineStart {
375
		buf.WriteString("\n")
376
	}
377
	writeIndentPrefix(buf, level, indent)
378
	*atLineStart = false
379
}
380
381
// writeIndentPrefix appends level copies of indent to buf. A level of zero or
// less writes nothing.
func writeIndentPrefix(buf *strings.Builder, level int, indent string) {
	if level <= 0 {
		return
	}
	buf.WriteString(strings.Repeat(indent, level))
}
386
387
// collapseWS returns s with every run of spaces, tabs, line feeds and
// carriage returns replaced by a single space. Leading and trailing runs are
// collapsed as well, not removed.
func collapseWS(s string) string {
	var out strings.Builder
	prevWasSpace := false
	for _, r := range s {
		isSpace := strings.ContainsRune(" \t\n\r", r)
		if !isSpace {
			out.WriteRune(r)
		} else if !prevWasSpace {
			out.WriteByte(' ')
		}
		prevWasSpace = isSpace
	}
	return out.String()
}
403
404
// tokenKind identifies the kind of a token produced by the XML tokenizer.
type tokenKind int

// Token types for the XML tokenizer.
const (
	tokenOpen      tokenKind = iota // <tag ...>
	tokenClose                      // </tag>
	tokenSelfClose                  // <tag .../>
	tokenText                       // plain text
	tokenCDATA                      // <![CDATA[...]]>
	tokenComment                    // <!-- ... -->
)

// token is one lexical piece of the input. raw holds the exact input bytes of
// the piece, so concatenating the raw fields of all tokens yields the input.
type token struct {
	kind tokenKind
	raw  string
}

// tagName returns the lower-cased element name of an open, close or
// self-closing tag token, and "" for every other kind of token. The name ends
// at the first space, tab, carriage return or line feed, so attributes and
// line breaks inside the tag are not part of it.
func (t token) tagName() string {
	var name string
	switch t.kind {
	case tokenOpen, tokenSelfClose:
		name = strings.TrimPrefix(t.raw, "<")
		if strings.HasSuffix(name, "/>") {
			name = name[:len(name)-2]
		} else {
			name = strings.TrimSuffix(name, ">")
		}
		if idx := strings.IndexAny(name, " \t\r\n"); idx > 0 {
			name = name[:idx]
		}
	case tokenClose:
		name = strings.TrimPrefix(t.raw, "</")
		name = strings.TrimSpace(strings.TrimSuffix(name, ">"))
	default:
		return ""
	}
	return strings.ToLower(name)
}

// tagEndIndex returns the index of the '>' that ends the tag starting at
// s[0], or -1 if s contains no '>'. A '>' inside a single- or double-quoted
// attribute value does not end the tag. When the quoting looks malformed (a
// quote that is never closed, or a '<' inside a quoted value) the first '>'
// in s is used instead.
func tagEndIndex(s string) int {
	var quote byte // the open quote character, or 0 outside a quoted value
	for i := 1; i < len(s); i++ {
		c := s[i]
		switch {
		case quote != 0:
			if c == quote {
				quote = 0
			} else if c == '<' {
				return strings.IndexByte(s, '>')
			}
		case c == '"' || c == '\'':
			quote = c
		case c == '>':
			return i
		}
	}
	return strings.IndexByte(s, '>')
}

// tokenize splits input into tags, text, CDATA sections and comments. It does
// not validate the markup: a CDATA section or comment without terminator runs
// to the end of the input, and a '<' that is never followed by '>' turns the
// rest of the input into a text token.
func tokenize(input string) []token {
	// upTo returns the length of the prefix of s that ends with the first
	// occurrence of term, or len(s) when term does not occur.
	upTo := func(s, term string) int {
		if n := strings.Index(s, term); n >= 0 {
			return n + len(term)
		}
		return len(s)
	}

	var tokens []token
	for len(input) > 0 {
		var tok token
		switch {
		case input[0] != '<':
			n := strings.IndexByte(input, '<')
			if n == -1 {
				n = len(input)
			}
			tok = token{tokenText, input[:n]}
		case strings.HasPrefix(input, "<![CDATA["):
			tok = token{tokenCDATA, input[:upTo(input, "]]>")]}
		case strings.HasPrefix(input, "<!--"):
			tok = token{tokenComment, input[:upTo(input, "-->")]}
		default:
			end := tagEndIndex(input)
			if end == -1 {
				tok = token{tokenText, input}
				break
			}
			tagStr := input[:end+1]
			switch {
			case strings.HasPrefix(tagStr, "</"):
				tok = token{tokenClose, tagStr}
			case strings.HasSuffix(tagStr, "/>"):
				tok = token{tokenSelfClose, tagStr}
			default:
				tok = token{tokenOpen, tagStr}
			}
		}
		tokens = append(tokens, tok)
		// Every token is at least one byte long, so the input always shrinks.
		input = input[len(tok.raw):]
	}
	return tokens
}
494

Source Files