mirror of
				https://gitcode.com/gitea/gitea.git
				synced 2025-10-26 05:04:27 +08:00 
			
		
		
		
	 3310dd1d19
			
		
	
	3310dd1d19
	
	
	
		
			
			Use Unicode placeholders to replace HTML tags and HTML entities first, then do diff, then recover the HTML tags and HTML entities. Now the code diff with highlight has stable behavior, and won't emit broken tags.
		
			
				
	
	
		
			224 lines
		
	
	
		
			7.9 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			224 lines
		
	
	
		
			7.9 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| // Copyright 2022 The Gitea Authors. All rights reserved.
 | |
| // Use of this source code is governed by a MIT-style
 | |
| // license that can be found in the LICENSE file.
 | |
| 
 | |
| package gitdiff
 | |
| 
 | |
| import (
 | |
| 	"strings"
 | |
| 
 | |
| 	"code.gitea.io/gitea/modules/highlight"
 | |
| 
 | |
| 	"github.com/sergi/go-diff/diffmatchpatch"
 | |
| )
 | |
| 
 | |
| // token is a html tag or entity, eg: "<span ...>", "</span>", "<"
 | |
| func extractHTMLToken(s string) (before, token, after string, valid bool) {
 | |
| 	for pos1 := 0; pos1 < len(s); pos1++ {
 | |
| 		if s[pos1] == '<' {
 | |
| 			pos2 := strings.IndexByte(s[pos1:], '>')
 | |
| 			if pos2 == -1 {
 | |
| 				return "", "", s, false
 | |
| 			}
 | |
| 			return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
 | |
| 		} else if s[pos1] == '&' {
 | |
| 			pos2 := strings.IndexByte(s[pos1:], ';')
 | |
| 			if pos2 == -1 {
 | |
| 				return "", "", s, false
 | |
| 			}
 | |
| 			return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
 | |
| 		}
 | |
| 	}
 | |
| 	return "", "", s, true
 | |
| }
 | |
| 
 | |
| // highlightCodeDiff is used to do diff with highlighted HTML code.
 | |
| // It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
 | |
| // The HTML tags and entities will be replaced by Unicode placeholders: "<span>{TEXT}</span>" => "\uE000{TEXT}\uE001"
 | |
| // These Unicode placeholders are friendly to the diff.
 | |
| // Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities.
 | |
| // It's guaranteed that the tags in final diff result are paired correctly.
 | |
| type highlightCodeDiff struct {
 | |
| 	placeholderBegin    rune
 | |
| 	placeholderMaxCount int
 | |
| 	placeholderIndex    int
 | |
| 	placeholderTokenMap map[rune]string
 | |
| 	tokenPlaceholderMap map[string]rune
 | |
| 
 | |
| 	placeholderOverflowCount int
 | |
| 
 | |
| 	lineWrapperTags []string
 | |
| }
 | |
| 
 | |
| func newHighlightCodeDiff() *highlightCodeDiff {
 | |
| 	return &highlightCodeDiff{
 | |
| 		placeholderBegin:    rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD)
 | |
| 		placeholderMaxCount: 64000,
 | |
| 		placeholderTokenMap: map[rune]string{},
 | |
| 		tokenPlaceholderMap: map[string]rune{},
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // nextPlaceholder returns 0 if no more placeholder can be used
 | |
| // the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line
 | |
| // so the placeholderMaxCount is impossible to be exhausted in real cases.
 | |
| func (hcd *highlightCodeDiff) nextPlaceholder() rune {
 | |
| 	for hcd.placeholderIndex < hcd.placeholderMaxCount {
 | |
| 		r := hcd.placeholderBegin + rune(hcd.placeholderIndex)
 | |
| 		hcd.placeholderIndex++
 | |
| 		// only use non-existing (not used by code) rune as placeholders
 | |
| 		if _, ok := hcd.placeholderTokenMap[r]; !ok {
 | |
| 			return r
 | |
| 		}
 | |
| 	}
 | |
| 	return 0 // no more available placeholder
 | |
| }
 | |
| 
 | |
| func (hcd *highlightCodeDiff) isInPlaceholderRange(r rune) bool {
 | |
| 	return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount)
 | |
| }
 | |
| 
 | |
| func (hcd *highlightCodeDiff) collectUsedRunes(code string) {
 | |
| 	for _, r := range code {
 | |
| 		if hcd.isInPlaceholderRange(r) {
 | |
| 			// put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore.
 | |
| 			hcd.placeholderTokenMap[r] = ""
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (hcd *highlightCodeDiff) diffWithHighlight(filename, language, codeA, codeB string) []diffmatchpatch.Diff {
 | |
| 	hcd.collectUsedRunes(codeA)
 | |
| 	hcd.collectUsedRunes(codeB)
 | |
| 
 | |
| 	highlightCodeA := highlight.Code(filename, language, codeA)
 | |
| 	highlightCodeB := highlight.Code(filename, language, codeB)
 | |
| 
 | |
| 	highlightCodeA = hcd.convertToPlaceholders(highlightCodeA)
 | |
| 	highlightCodeB = hcd.convertToPlaceholders(highlightCodeB)
 | |
| 
 | |
| 	diffs := diffMatchPatch.DiffMain(highlightCodeA, highlightCodeB, true)
 | |
| 	diffs = diffMatchPatch.DiffCleanupEfficiency(diffs)
 | |
| 
 | |
| 	for i := range diffs {
 | |
| 		hcd.recoverOneDiff(&diffs[i])
 | |
| 	}
 | |
| 	return diffs
 | |
| }
 | |
| 
 | |
| // convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
 | |
| func (hcd *highlightCodeDiff) convertToPlaceholders(htmlCode string) string {
 | |
| 	var tagStack []string
 | |
| 	res := strings.Builder{}
 | |
| 
 | |
| 	firstRunForLineTags := hcd.lineWrapperTags == nil
 | |
| 
 | |
| 	var beforeToken, token string
 | |
| 	var valid bool
 | |
| 
 | |
| 	// the standard chroma highlight HTML is "<span class="line [hl]"><span class="cl"> ... </span></span>"
 | |
| 	for {
 | |
| 		beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode)
 | |
| 		if !valid || token == "" {
 | |
| 			break
 | |
| 		}
 | |
| 		// write the content before the token into result string, and consume the token in the string
 | |
| 		res.WriteString(beforeToken)
 | |
| 
 | |
| 		// the line wrapper tags should be removed before diff
 | |
| 		if strings.HasPrefix(token, `<span class="line`) || strings.HasPrefix(token, `<span class="cl"`) {
 | |
| 			if firstRunForLineTags {
 | |
| 				// if this is the first run for converting, save the line wrapper tags for later use, they should be added back
 | |
| 				hcd.lineWrapperTags = append(hcd.lineWrapperTags, token)
 | |
| 			}
 | |
| 			htmlCode = strings.TrimSuffix(htmlCode, "</span>")
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		var tokenInMap string
 | |
| 		if strings.HasSuffix(token, "</") { // for closing tag
 | |
| 			if len(tagStack) == 0 {
 | |
| 				break // invalid diff result, no opening tag but see closing tag
 | |
| 			}
 | |
| 			// make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags
 | |
| 			// the closing tag will be recorded in the map by key "</span><!-- <span the-opening> -->" for "<span the-opening>"
 | |
| 			tokenInMap = token + "<!-- " + tagStack[len(tagStack)-1] + "-->"
 | |
| 			tagStack = tagStack[:len(tagStack)-1]
 | |
| 		} else if token[0] == '<' { // for opening tag
 | |
| 			tokenInMap = token
 | |
| 			tagStack = append(tagStack, token)
 | |
| 		} else if token[0] == '&' { // for html entity
 | |
| 			tokenInMap = token
 | |
| 		} // else: impossible
 | |
| 
 | |
| 		// remember the placeholder and token in the map
 | |
| 		placeholder, ok := hcd.tokenPlaceholderMap[tokenInMap]
 | |
| 		if !ok {
 | |
| 			placeholder = hcd.nextPlaceholder()
 | |
| 			if placeholder != 0 {
 | |
| 				hcd.tokenPlaceholderMap[tokenInMap] = placeholder
 | |
| 				hcd.placeholderTokenMap[placeholder] = tokenInMap
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		if placeholder != 0 {
 | |
| 			res.WriteRune(placeholder) // use the placeholder to replace the token
 | |
| 		} else {
 | |
| 			// unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting
 | |
| 			// usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma.
 | |
| 			hcd.placeholderOverflowCount++
 | |
| 			if strings.HasPrefix(token, "&") {
 | |
| 				// when the token is a html entity, something must be outputted even if there is no placeholder.
 | |
| 				res.WriteRune(0xFFFD)      // replacement character TODO: how to handle this case more gracefully?
 | |
| 				res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result.
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// write the remaining string
 | |
| 	res.WriteString(htmlCode)
 | |
| 	return res.String()
 | |
| }
 | |
| 
 | |
| func (hcd *highlightCodeDiff) recoverOneDiff(diff *diffmatchpatch.Diff) {
 | |
| 	sb := strings.Builder{}
 | |
| 	var tagStack []string
 | |
| 
 | |
| 	for _, r := range diff.Text {
 | |
| 		token, ok := hcd.placeholderTokenMap[r]
 | |
| 		if !ok || token == "" {
 | |
| 			sb.WriteRune(r) // if the rune is not a placeholder, write it as it is
 | |
| 			continue
 | |
| 		}
 | |
| 		var tokenToRecover string
 | |
| 		if strings.HasPrefix(token, "</") { // for closing tag
 | |
| 			// only get the tag itself, ignore the trailing comment (for how the comment is generated, see the code in `convert` function)
 | |
| 			tokenToRecover = token[:strings.IndexByte(token, '>')+1]
 | |
| 			if len(tagStack) == 0 {
 | |
| 				continue // if no opening tag in stack yet, skip the closing tag
 | |
| 			}
 | |
| 			tagStack = tagStack[:len(tagStack)-1]
 | |
| 		} else if token[0] == '<' { // for opening tag
 | |
| 			tokenToRecover = token
 | |
| 			tagStack = append(tagStack, token)
 | |
| 		} else if token[0] == '&' { // for html entity
 | |
| 			tokenToRecover = token
 | |
| 		} // else: impossible
 | |
| 		sb.WriteString(tokenToRecover)
 | |
| 	}
 | |
| 
 | |
| 	if len(tagStack) > 0 {
 | |
| 		// close all opening tags
 | |
| 		for i := len(tagStack) - 1; i >= 0; i-- {
 | |
| 			tagToClose := tagStack[i]
 | |
| 			// get the closing tag "</span>" from "<span class=...>" or "<span>"
 | |
| 			pos := strings.IndexAny(tagToClose, " >")
 | |
| 			if pos != -1 {
 | |
| 				sb.WriteString("</" + tagToClose[1:pos] + ">")
 | |
| 			} // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	diff.Text = sb.String()
 | |
| }
 |