mirror of
https://github.com/HeidiSQL/HeidiSQL.git
synced 2025-08-06 18:24:26 +08:00
Fix bug #574 "UTF-8 textfiles without BOM are read as ANSI files". Split out character set detection to GetFileCharset and leave TStream creating in OpenTextFile().
This commit is contained in:
@ -96,6 +96,7 @@ type
|
||||
procedure SetWindowSizeGrip(hWnd: HWND; Enable: boolean);
|
||||
procedure SaveUnicodeFile(Filename: String; Text: WideString);
|
||||
procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
|
||||
function GetFileCharset(Stream: TFileStream): TFileCharset;
|
||||
function ReadTextfileChunk(Stream: TFileStream; FileCharset: TFileCharset; ChunkSize: Int64 = 0): WideString;
|
||||
function ReadTextfile(Filename: String): WideString;
|
||||
|
||||
@ -2382,18 +2383,50 @@ end;
|
||||
Open a textfile unicode safe and return a stream + its charset
|
||||
}
|
||||
procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
begin
  // Opens Filename for reading (other processes may still read it, but not
  // write to it) and reports the character set detected by GetFileCharset.
  //
  // Filename:    path of the text file to open.
  // Stream:      receives the newly created file stream. On success this
  //              procedure does not free it, so the caller is expected to.
  // FileCharset: receives the result of GetFileCharset(Stream).
  //
  // Any exception raised while creating the stream or detecting the charset
  // is passed on to the caller.
  Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyWrite);
  try
    Stream.Position := 0;
    FileCharset := GetFileCharset(Stream);
  except
    // Detection failed after the file was already opened: release the handle
    // here, because the caller never receives a usable stream it could free.
    Stream.Free;
    Stream := nil;
    raise;
  end;
end;
|
||||
|
||||
|
||||
{**
|
||||
Detect a file's character set which can be
|
||||
UTF-16 BE with BOM
|
||||
UTF-16 LE with BOM
|
||||
UTF-8 with or without BOM
|
||||
ANSI
|
||||
@see http://en.wikipedia.org/wiki/Byte_Order_Mark
|
||||
}
|
||||
function GetFileCharset(Stream: TFileStream): TFileCharset;
|
||||
var
|
||||
ByteOrderMark: WideChar;
|
||||
BytesRead: Integer;
|
||||
Utf8Test: array[0..2] of AnsiChar;
|
||||
Utf8Test: AnsiString; //array[0..2] of AnsiChar;
|
||||
Buffer: array of Byte;
|
||||
BufferSize, i, FoundUTF8Strings: Integer;
|
||||
const
|
||||
UNICODE_BOM = WideChar($FEFF);
|
||||
UNICODE_BOM_SWAPPED = WideChar($FFFE);
|
||||
UTF8_BOM = AnsiString(#$EF#$BB#$BF);
|
||||
begin
|
||||
Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyWrite);
|
||||
Stream.Position := 0;
|
||||
MinimumCountOfUTF8Strings = 1;
|
||||
MaxBufferSize = $4000;
|
||||
|
||||
// 3 trailing bytes are the maximum in valid UTF-8 streams,
|
||||
// so a count of 4 trailing bytes is enough to detect invalid UTF-8 streams
|
||||
function CountOfTrailingBytes: Integer;
begin
  // Counts consecutive continuation bytes ($80..$BF) that follow the byte at
  // the enclosing routine's index i, giving up once 4 have been counted.
  // Side effect: i is advanced. On return it sits on the first byte that was
  // not counted, or equals BufferSize if the buffer ran out.
  Result := 0;
  i := i + 1; // step past the lead byte
  while (Result < 4) and (i < BufferSize) do begin
    if not (Buffer[i] in [$80..$BF]) then
      Exit; // not a continuation byte: stop, leaving i on this byte
    Result := Result + 1;
    i := i + 1;
  end;
end;
|
||||
begin
|
||||
// Byte Order Mark
|
||||
ByteOrderMark := #0;
|
||||
if (Stream.Size - Stream.Position) >= SizeOf(ByteOrderMark) then begin
|
||||
@ -2410,15 +2443,104 @@ begin
|
||||
end;
|
||||
// Test Byte Order Mark
|
||||
if ByteOrderMark = UNICODE_BOM then
|
||||
FileCharset := fcsUnicode
|
||||
Result := fcsUnicode
|
||||
else if ByteOrderMark = UNICODE_BOM_SWAPPED then
|
||||
FileCharset := fcsUnicodeSwapped
|
||||
Result := fcsUnicodeSwapped
|
||||
else if Utf8Test = UTF8_BOM then
|
||||
FileCharset := fcsUtf8
|
||||
else
|
||||
FileCharset := fcsAnsi;
|
||||
end;
|
||||
Result := fcsUtf8
|
||||
else begin
|
||||
{ @note Taken from SynUnicode.pas }
|
||||
{ If no BOM was found, check for leading/trailing byte sequences,
|
||||
which are uncommon in usual non UTF-8 encoded text.
|
||||
|
||||
NOTE: There is no 100% safe way to detect UTF-8 streams. The bigger
|
||||
MinimumCountOfUTF8Strings, the lower is the probability of
|
||||
a false positive. On the other hand, a big MinimumCountOfUTF8Strings
|
||||
makes it unlikely to detect files with only little usage of non
|
||||
US-ASCII chars, like usual in European languages. }
|
||||
|
||||
// if no special characteristics are found it is not UTF-8
|
||||
Result := fcsAnsi;
|
||||
|
||||
// if Stream is nil, let Delphi raise the exception, by accessing Stream,
|
||||
// to signal an invalid result
|
||||
|
||||
// start analysis at actual Stream.Position
|
||||
BufferSize := Min(MaxBufferSize, Stream.Size - Stream.Position);
|
||||
|
||||
if BufferSize > 0 then begin
|
||||
SetLength(Buffer, BufferSize);
|
||||
Stream.ReadBuffer(Buffer[0], BufferSize);
|
||||
Stream.Seek(-BufferSize, soFromCurrent);
|
||||
|
||||
FoundUTF8Strings := 0;
|
||||
i := 0;
|
||||
while i < BufferSize do begin
|
||||
if FoundUTF8Strings = MinimumCountOfUTF8Strings then begin
|
||||
Result := fcsUtf8;
|
||||
Break;
|
||||
end;
|
||||
case Buffer[i] of
|
||||
$00..$7F: // skip US-ASCII characters as they could belong to various charsets
|
||||
;
|
||||
$C2..$DF:
|
||||
if CountOfTrailingBytes = 1 then
|
||||
inc(FoundUTF8Strings)
|
||||
else
|
||||
Break;
|
||||
$E0:
|
||||
begin
|
||||
inc(i);
|
||||
if (i < BufferSize) and (Buffer[i] in [$A0..$BF]) and (CountOfTrailingBytes = 1) then
|
||||
inc(FoundUTF8Strings)
|
||||
else
|
||||
Break;
|
||||
end;
|
||||
$E1..$EC, $EE..$EF:
|
||||
if CountOfTrailingBytes = 2 then
|
||||
inc(FoundUTF8Strings)
|
||||
else
|
||||
Break;
|
||||
$ED:
|
||||
begin
|
||||
inc(i);
|
||||
if (i < BufferSize) and (Buffer[i] in [$80..$9F]) and (CountOfTrailingBytes = 1) then
|
||||
inc(FoundUTF8Strings)
|
||||
else
|
||||
Break;
|
||||
end;
|
||||
$F0:
|
||||
begin
|
||||
inc(i);
|
||||
if (i < BufferSize) and (Buffer[i] in [$90..$BF]) and (CountOfTrailingBytes = 2) then
|
||||
inc(FoundUTF8Strings)
|
||||
else
|
||||
Break;
|
||||
end;
|
||||
$F1..$F3:
|
||||
if CountOfTrailingBytes = 3 then
|
||||
inc(FoundUTF8Strings)
|
||||
else
|
||||
Break;
|
||||
$F4:
|
||||
begin
|
||||
inc(i);
|
||||
if (i < BufferSize) and (Buffer[i] in [$80..$8F]) and (CountOfTrailingBytes = 2) then
|
||||
inc(FoundUTF8Strings)
|
||||
else
|
||||
Break;
|
||||
end;
|
||||
$C0, $C1, $F5..$FF: // invalid UTF-8 bytes
|
||||
Break;
|
||||
$80..$BF: // trailing bytes are consumed when handling leading bytes,
|
||||
// any occurrence of "orphaned" trailing bytes is invalid UTF-8
|
||||
Break;
|
||||
end;
|
||||
inc(i);
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
|
||||
{**
|
||||
Read a chunk out of a textfile unicode safe by passing a stream and its charset
|
||||
|
Reference in New Issue
Block a user