mirror of
				https://gitcode.com/gitea/gitea.git
				synced 2025-10-26 13:16:28 +08:00 
			
		
		
		
	Detect charset and convert non UTF-8 files for display (#4950)
* Detect charset and convert non UTF-8 files for display
* Refactor and move function to correct module
* Revert unrelated changes
* More unrelated changes
* Duplicate content for small text to have better encoding detection
* Check if original content is valid before duplicating it
This commit is contained in:
		| @ -59,7 +59,22 @@ func DetectEncoding(content []byte) (string, error) { | |||||||
| 		return "UTF-8", nil | 		return "UTF-8", nil | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	result, err := chardet.NewTextDetector().DetectBest(content) | 	textDetector := chardet.NewTextDetector() | ||||||
|  | 	var detectContent []byte | ||||||
|  | 	if len(content) < 1024 { | ||||||
|  | 		// Check if original content is valid | ||||||
|  | 		if _, err := textDetector.DetectBest(content); err != nil { | ||||||
|  | 			return "", err | ||||||
|  | 		} | ||||||
|  | 		times := 1024 / len(content) | ||||||
|  | 		detectContent = make([]byte, 0, times*len(content)) | ||||||
|  | 		for i := 0; i < times; i++ { | ||||||
|  | 			detectContent = append(detectContent, content...) | ||||||
|  | 		} | ||||||
|  | 	} else { | ||||||
|  | 		detectContent = content | ||||||
|  | 	} | ||||||
|  | 	result, err := textDetector.DetectBest(detectContent) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return "", err | 		return "", err | ||||||
| 	} | 	} | ||||||
|  | |||||||
| @ -1,3 +1,4 @@ | |||||||
|  | // Copyright 2018 The Gitea Authors. All rights reserved. | ||||||
| // Copyright 2014 The Gogs Authors. All rights reserved. | // Copyright 2014 The Gogs Authors. All rights reserved. | ||||||
| // Use of this source code is governed by a MIT-style | // Use of this source code is governed by a MIT-style | ||||||
| // license that can be found in the LICENSE file. | // license that can be found in the LICENSE file. | ||||||
| @ -275,7 +276,7 @@ func ToUTF8WithErr(content []byte) (string, error) { | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	// If there is an error, we concatenate the nicely decoded part and the | 	// If there is an error, we concatenate the nicely decoded part and the | ||||||
| 	// original left over. This way we won't loose data. | 	// original left over. This way we won't lose data. | ||||||
| 	result, n, err := transform.String(encoding.NewDecoder(), string(content)) | 	result, n, err := transform.String(encoding.NewDecoder(), string(content)) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		result = result + string(content[n:]) | 		result = result + string(content[n:]) | ||||||
| @ -284,6 +285,28 @@ func ToUTF8WithErr(content []byte) (string, error) { | |||||||
| 	return result, err | 	return result, err | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible | ||||||
|  | func ToUTF8WithFallback(content []byte) []byte { | ||||||
|  | 	charsetLabel, err := base.DetectEncoding(content) | ||||||
|  | 	if err != nil || charsetLabel == "UTF-8" { | ||||||
|  | 		return content | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	encoding, _ := charset.Lookup(charsetLabel) | ||||||
|  | 	if encoding == nil { | ||||||
|  | 		return content | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	// If there is an error, we concatenate the nicely decoded part and the | ||||||
|  | 	// original left over. This way we won't lose data. | ||||||
|  | 	result, n, err := transform.Bytes(encoding.NewDecoder(), content) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return append(result, content[n:]...) | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	return result | ||||||
|  | } | ||||||
|  |  | ||||||
| // ToUTF8 converts content to UTF8 encoding and ignore error | // ToUTF8 converts content to UTF8 encoding and ignore error | ||||||
| func ToUTF8(content string) string { | func ToUTF8(content string) string { | ||||||
| 	res, _ := ToUTF8WithErr([]byte(content)) | 	res, _ := ToUTF8WithErr([]byte(content)) | ||||||
|  | |||||||
| @ -25,6 +25,7 @@ import ( | |||||||
| 	"code.gitea.io/gitea/modules/markup" | 	"code.gitea.io/gitea/modules/markup" | ||||||
| 	"code.gitea.io/gitea/modules/setting" | 	"code.gitea.io/gitea/modules/setting" | ||||||
| 	"code.gitea.io/gitea/modules/templates" | 	"code.gitea.io/gitea/modules/templates" | ||||||
|  |  | ||||||
| 	"github.com/Unknwon/paginater" | 	"github.com/Unknwon/paginater" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| @ -99,7 +100,8 @@ func renderDirectory(ctx *context.Context, treeLink string) { | |||||||
| 				ctx.Data["FileSize"] = readmeFile.Size() | 				ctx.Data["FileSize"] = readmeFile.Size() | ||||||
| 			} else { | 			} else { | ||||||
| 				d, _ := ioutil.ReadAll(dataRc) | 				d, _ := ioutil.ReadAll(dataRc) | ||||||
| 				buf = append(buf, d...) | 				buf = templates.ToUTF8WithFallback(append(buf, d...)) | ||||||
|  |  | ||||||
| 				if markup.Type(readmeFile.Name()) != "" { | 				if markup.Type(readmeFile.Name()) != "" { | ||||||
| 					ctx.Data["IsMarkup"] = true | 					ctx.Data["IsMarkup"] = true | ||||||
| 					ctx.Data["FileContent"] = string(markup.Render(readmeFile.Name(), buf, treeLink, ctx.Repo.Repository.ComposeMetas())) | 					ctx.Data["FileContent"] = string(markup.Render(readmeFile.Name(), buf, treeLink, ctx.Repo.Repository.ComposeMetas())) | ||||||
| @ -203,7 +205,7 @@ func renderFile(ctx *context.Context, entry *git.TreeEntry, treeLink, rawLink st | |||||||
| 		} | 		} | ||||||
|  |  | ||||||
| 		d, _ := ioutil.ReadAll(dataRc) | 		d, _ := ioutil.ReadAll(dataRc) | ||||||
| 		buf = append(buf, d...) | 		buf = templates.ToUTF8WithFallback(append(buf, d...)) | ||||||
|  |  | ||||||
| 		readmeExist := markup.IsReadmeFile(blob.Name()) | 		readmeExist := markup.IsReadmeFile(blob.Name()) | ||||||
| 		ctx.Data["ReadmeExist"] = readmeExist | 		ctx.Data["ReadmeExist"] = readmeExist | ||||||
|  | |||||||
		Reference in New Issue
	
	Block a user
	 Lauris BH
					Lauris BH