mirror of
https://github.com/HeidiSQL/HeidiSQL.git
synced 2025-08-06 18:24:26 +08:00
Fix bug #574 "UTF-8 textfiles without BOM are read as ANSI files". Split out character set detection to GetFileCharset and leave TStream creating in OpenTextFile().
This commit is contained in:
@ -96,6 +96,7 @@ type
|
|||||||
procedure SetWindowSizeGrip(hWnd: HWND; Enable: boolean);
|
procedure SetWindowSizeGrip(hWnd: HWND; Enable: boolean);
|
||||||
procedure SaveUnicodeFile(Filename: String; Text: WideString);
|
procedure SaveUnicodeFile(Filename: String; Text: WideString);
|
||||||
procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
|
procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
|
||||||
|
function GetFileCharset(Stream: TFileStream): TFileCharset;
|
||||||
function ReadTextfileChunk(Stream: TFileStream; FileCharset: TFileCharset; ChunkSize: Int64 = 0): WideString;
|
function ReadTextfileChunk(Stream: TFileStream; FileCharset: TFileCharset; ChunkSize: Int64 = 0): WideString;
|
||||||
function ReadTextfile(Filename: String): WideString;
|
function ReadTextfile(Filename: String): WideString;
|
||||||
|
|
||||||
@ -2382,18 +2383,50 @@ end;
|
|||||||
Open a textfile unicode safe and return a stream + its charset
|
Open a textfile unicode safe and return a stream + its charset
|
||||||
}
|
}
|
||||||
procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
|
procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
|
||||||
|
begin
|
||||||
|
Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyWrite); // opened for reading; the stream is handed back through the out parameter and is not freed in this procedure
|
||||||
|
Stream.Position := 0; // charset detection starts at the first byte of the file
|
||||||
|
FileCharset := GetFileCharset(Stream); // detection itself is delegated to GetFileCharset
|
||||||
|
end;
|
||||||
|
|
||||||
|
|
||||||
|
{**
|
||||||
|
Detect a file's character set which can be
|
||||||
|
UTF-16 BE with BOM
|
||||||
|
UTF-16 LE with BOM
|
||||||
|
UTF-8 with or without BOM
|
||||||
|
ANSI
|
||||||
|
@see http://en.wikipedia.org/wiki/Byte_Order_Mark
|
||||||
|
}
|
||||||
|
function GetFileCharset(Stream: TFileStream): TFileCharset;
|
||||||
var
|
var
|
||||||
ByteOrderMark: WideChar;
|
ByteOrderMark: WideChar;
|
||||||
BytesRead: Integer;
|
BytesRead: Integer;
|
||||||
Utf8Test: array[0..2] of AnsiChar;
|
Utf8Test: AnsiString; //array[0..2] of AnsiChar;
|
||||||
|
Buffer: array of Byte;
|
||||||
|
BufferSize, i, FoundUTF8Strings: Integer;
|
||||||
const
|
const
|
||||||
UNICODE_BOM = WideChar($FEFF);
|
UNICODE_BOM = WideChar($FEFF);
|
||||||
UNICODE_BOM_SWAPPED = WideChar($FFFE);
|
UNICODE_BOM_SWAPPED = WideChar($FFFE);
|
||||||
UTF8_BOM = AnsiString(#$EF#$BB#$BF);
|
UTF8_BOM = AnsiString(#$EF#$BB#$BF);
|
||||||
begin
|
MinimumCountOfUTF8Strings = 1;
|
||||||
Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyWrite);
|
MaxBufferSize = $4000;
|
||||||
Stream.Position := 0;
|
|
||||||
|
|
||||||
|
// 3 trailing bytes are the maximum in valid UTF-8 streams,
|
||||||
|
// so a count of 4 trailing bytes is enough to detect invalid UTF-8 streams
|
||||||
|
function CountOfTrailingBytes: Integer; // nested helper: works on the enclosing routine's i, Buffer and BufferSize; returns how many bytes in $80..$BF follow Buffer[i] (capped at 4)
|
||||||
|
begin
|
||||||
|
Result := 0;
|
||||||
|
inc(i); // step past the byte the caller has just examined; note that this advances the caller's index
|
||||||
|
while (i < BufferSize) and (Result < 4) do begin // stop at the end of the buffer or once 4 bytes have been counted
|
||||||
|
if Buffer[i] in [$80..$BF] then // $80..$BF is the range counted as a trailing byte
|
||||||
|
inc(Result)
|
||||||
|
else
|
||||||
|
Break; // i is left on the first byte that is not a trailing byte
|
||||||
|
inc(i);
|
||||||
|
end;
|
||||||
|
end;
|
||||||
|
begin
|
||||||
// Byte Order Mark
|
// Byte Order Mark
|
||||||
ByteOrderMark := #0;
|
ByteOrderMark := #0;
|
||||||
if (Stream.Size - Stream.Position) >= SizeOf(ByteOrderMark) then begin
|
if (Stream.Size - Stream.Position) >= SizeOf(ByteOrderMark) then begin
|
||||||
@ -2410,15 +2443,104 @@ begin
|
|||||||
end;
|
end;
|
||||||
// Test Byte Order Mark
|
// Test Byte Order Mark
|
||||||
if ByteOrderMark = UNICODE_BOM then
|
if ByteOrderMark = UNICODE_BOM then
|
||||||
FileCharset := fcsUnicode
|
Result := fcsUnicode
|
||||||
else if ByteOrderMark = UNICODE_BOM_SWAPPED then
|
else if ByteOrderMark = UNICODE_BOM_SWAPPED then
|
||||||
FileCharset := fcsUnicodeSwapped
|
Result := fcsUnicodeSwapped
|
||||||
else if Utf8Test = UTF8_BOM then
|
else if Utf8Test = UTF8_BOM then
|
||||||
FileCharset := fcsUtf8
|
Result := fcsUtf8
|
||||||
else
|
else begin
|
||||||
FileCharset := fcsAnsi;
|
{ @note Taken from SynUnicode.pas }
|
||||||
end;
|
{ If no BOM was found, check for leading/trailing byte sequences,
|
||||||
|
which are uncommon in usual non UTF-8 encoded text.
|
||||||
|
|
||||||
|
NOTE: There is no 100% safe way to detect UTF-8 streams. The bigger
|
||||||
|
MinimumCountOfUTF8Strings, the lower is the probability of
|
||||||
|
a false positive. On the other hand, a big MinimumCountOfUTF8Strings
|
||||||
|
makes it unlikely to detect files with only little usage of non
|
||||||
|
US-ASCII chars, like usual in European languages. }
|
||||||
|
|
||||||
|
// if no special characteristics are found it is not UTF-8
|
||||||
|
Result := fcsAnsi;
|
||||||
|
|
||||||
|
// if Stream is nil, let Delphi raise the exception, by accessing Stream,
|
||||||
|
// to signal an invalid result
|
||||||
|
|
||||||
|
// start analysis at actual Stream.Position
|
||||||
|
BufferSize := Min(MaxBufferSize, Stream.Size - Stream.Position);
|
||||||
|
|
||||||
|
if BufferSize > 0 then begin
|
||||||
|
SetLength(Buffer, BufferSize);
|
||||||
|
Stream.ReadBuffer(Buffer[0], BufferSize);
|
||||||
|
Stream.Seek(-BufferSize, soFromCurrent);
|
||||||
|
|
||||||
|
FoundUTF8Strings := 0;
|
||||||
|
i := 0;
|
||||||
|
while i < BufferSize do begin
|
||||||
|
if FoundUTF8Strings = MinimumCountOfUTF8Strings then begin
|
||||||
|
Result := fcsUtf8;
|
||||||
|
Break;
|
||||||
|
end;
|
||||||
|
case Buffer[i] of
|
||||||
|
$00..$7F: // skip US-ASCII characters as they could belong to various charsets
|
||||||
|
;
|
||||||
|
$C2..$DF:
|
||||||
|
if CountOfTrailingBytes = 1 then
|
||||||
|
inc(FoundUTF8Strings)
|
||||||
|
else
|
||||||
|
Break;
|
||||||
|
$E0:
|
||||||
|
begin
|
||||||
|
inc(i);
|
||||||
|
if (i < BufferSize) and (Buffer[i] in [$A0..$BF]) and (CountOfTrailingBytes = 1) then
|
||||||
|
inc(FoundUTF8Strings)
|
||||||
|
else
|
||||||
|
Break;
|
||||||
|
end;
|
||||||
|
$E1..$EC, $EE..$EF:
|
||||||
|
if CountOfTrailingBytes = 2 then
|
||||||
|
inc(FoundUTF8Strings)
|
||||||
|
else
|
||||||
|
Break;
|
||||||
|
$ED:
|
||||||
|
begin
|
||||||
|
inc(i);
|
||||||
|
if (i < BufferSize) and (Buffer[i] in [$80..$9F]) and (CountOfTrailingBytes = 1) then
|
||||||
|
inc(FoundUTF8Strings)
|
||||||
|
else
|
||||||
|
Break;
|
||||||
|
end;
|
||||||
|
$F0:
|
||||||
|
begin
|
||||||
|
inc(i);
|
||||||
|
if (i < BufferSize) and (Buffer[i] in [$90..$BF]) and (CountOfTrailingBytes = 2) then
|
||||||
|
inc(FoundUTF8Strings)
|
||||||
|
else
|
||||||
|
Break;
|
||||||
|
end;
|
||||||
|
$F1..$F3:
|
||||||
|
if CountOfTrailingBytes = 3 then
|
||||||
|
inc(FoundUTF8Strings)
|
||||||
|
else
|
||||||
|
Break;
|
||||||
|
$F4:
|
||||||
|
begin
|
||||||
|
inc(i);
|
||||||
|
if (i < BufferSize) and (Buffer[i] in [$80..$8F]) and (CountOfTrailingBytes = 2) then
|
||||||
|
inc(FoundUTF8Strings)
|
||||||
|
else
|
||||||
|
Break;
|
||||||
|
end;
|
||||||
|
$C0, $C1, $F5..$FF: // invalid UTF-8 bytes
|
||||||
|
Break;
|
||||||
|
$80..$BF: // trailing bytes are consumed when handling leading bytes,
|
||||||
|
// any occurrence of "orphaned" trailing bytes is invalid UTF-8
|
||||||
|
Break;
|
||||||
|
end;
|
||||||
|
inc(i);
|
||||||
|
end;
|
||||||
|
end;
|
||||||
|
end;
|
||||||
|
end;
|
||||||
|
|
||||||
{**
|
{**
|
||||||
Read a chunk out of a textfile unicode safe by passing a stream and its charset
|
Read a chunk out of a textfile unicode safe by passing a stream and its charset
|
||||||
|
Reference in New Issue
Block a user