Fix bug #574 "UTF-8 textfiles without BOM are read as ANSI files". Split character set detection out into GetFileCharset and leave the TStream creation in OpenTextFile().

This commit is contained in:
Ansgar Becker
2008-06-22 17:45:11 +00:00
parent fcdefa1f7a
commit 3a1d54da86

View File

@ -96,6 +96,7 @@ type
procedure SetWindowSizeGrip(hWnd: HWND; Enable: boolean);
procedure SaveUnicodeFile(Filename: String; Text: WideString);
procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
function GetFileCharset(Stream: TFileStream): TFileCharset;
function ReadTextfileChunk(Stream: TFileStream; FileCharset: TFileCharset; ChunkSize: Int64 = 0): WideString;
function ReadTextfile(Filename: String): WideString;
@ -2382,18 +2383,50 @@ end;
Open a textfile unicode safe and return a stream + its charset
}
procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
begin
  // Open read-only and deny write access to others while the stream is alive.
  // On success the caller owns Stream and is responsible for freeing it.
  Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyWrite);
  try
    Stream.Position := 0;
    // Detection reads from the stream and may therefore raise.
    FileCharset := GetFileCharset(Stream);
  except
    // Do not leak the open file handle when detection fails: release the
    // stream, hand back nil and let the original exception propagate.
    Stream.Free;
    Stream := nil;
    raise;
  end;
end;
{**
Detect a file's character set which can be
UTF-16 BE with BOM
UTF-16 LE with BOM
UTF-8 with or without BOM
ANSI
@see http://en.wikipedia.org/wiki/Byte_Order_Mark
}
function GetFileCharset(Stream: TFileStream): TFileCharset;
var
ByteOrderMark: WideChar;
BytesRead: Integer;
Utf8Test: array[0..2] of AnsiChar;
Utf8Test: AnsiString; //array[0..2] of AnsiChar;
Buffer: array of Byte;
BufferSize, i, FoundUTF8Strings: Integer;
const
UNICODE_BOM = WideChar($FEFF);
UNICODE_BOM_SWAPPED = WideChar($FFFE);
UTF8_BOM = AnsiString(#$EF#$BB#$BF);
begin
Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyWrite);
Stream.Position := 0;
MinimumCountOfUTF8Strings = 1;
MaxBufferSize = $4000;
// 3 trailing bytes are the maximum in valid UTF-8 streams,
// so a count of 4 trailing bytes is enough to detect invalid UTF-8 streams
// Counts the trailing bytes ($80..$BF) that directly follow Buffer[i],
// giving up at the end of the buffer or once 4 have been counted.
// On return i rests on the last trailing byte that was consumed (i is left
// unchanged when there is none). The main loop performs its own inc(i) after
// every case branch, so it continues with the first byte that has not been
// examined yet; advancing i past that byte here would make the loop skip it.
function CountOfTrailingBytes: Integer;
begin
  Result := 0;
  while (i + 1 < BufferSize) and (Result < 4) do begin
    // Look ahead without consuming, so a non-trailing byte stays unread
    if not (Buffer[i + 1] in [$80..$BF]) then
      Break;
    inc(i);
    inc(Result);
  end;
end;
begin
// Byte Order Mark
ByteOrderMark := #0;
if (Stream.Size - Stream.Position) >= SizeOf(ByteOrderMark) then begin
@ -2410,15 +2443,104 @@ begin
end;
// Test Byte Order Mark
if ByteOrderMark = UNICODE_BOM then
FileCharset := fcsUnicode
Result := fcsUnicode
else if ByteOrderMark = UNICODE_BOM_SWAPPED then
FileCharset := fcsUnicodeSwapped
Result := fcsUnicodeSwapped
else if Utf8Test = UTF8_BOM then
FileCharset := fcsUtf8
else
FileCharset := fcsAnsi;
end;
Result := fcsUtf8
else begin
{ @note Taken from SynUnicode.pas }
{ If no BOM was found, check for leading/trailing byte sequences,
which are uncommon in usual non UTF-8 encoded text.
NOTE: There is no 100% safe way to detect UTF-8 streams. The bigger
MinimumCountOfUTF8Strings, the lower is the probability of
a false positive. On the other hand, a big MinimumCountOfUTF8Strings
makes it unlikely to detect files with only little usage of non
US-ASCII chars, like usual in European languages. }
// if no special characteristics are found it is not UTF-8
Result := fcsAnsi;
// if Stream is nil, let Delphi raise the exception, by accessing Stream,
// to signal an invalid result
// start analysis at the current Stream.Position
BufferSize := Min(MaxBufferSize, Stream.Size - Stream.Position);
if BufferSize > 0 then begin
SetLength(Buffer, BufferSize);
Stream.ReadBuffer(Buffer[0], BufferSize);
Stream.Seek(-BufferSize, soFromCurrent);
FoundUTF8Strings := 0;
i := 0;
while i < BufferSize do begin
if FoundUTF8Strings = MinimumCountOfUTF8Strings then begin
Result := fcsUtf8;
Break;
end;
case Buffer[i] of
$00..$7F: // skip US-ASCII characters as they could belong to various charsets
;
$C2..$DF:
if CountOfTrailingBytes = 1 then
inc(FoundUTF8Strings)
else
Break;
$E0:
begin
inc(i);
if (i < BufferSize) and (Buffer[i] in [$A0..$BF]) and (CountOfTrailingBytes = 1) then
inc(FoundUTF8Strings)
else
Break;
end;
$E1..$EC, $EE..$EF:
if CountOfTrailingBytes = 2 then
inc(FoundUTF8Strings)
else
Break;
$ED:
begin
inc(i);
if (i < BufferSize) and (Buffer[i] in [$80..$9F]) and (CountOfTrailingBytes = 1) then
inc(FoundUTF8Strings)
else
Break;
end;
$F0:
begin
inc(i);
if (i < BufferSize) and (Buffer[i] in [$90..$BF]) and (CountOfTrailingBytes = 2) then
inc(FoundUTF8Strings)
else
Break;
end;
$F1..$F3:
if CountOfTrailingBytes = 3 then
inc(FoundUTF8Strings)
else
Break;
$F4:
begin
inc(i);
if (i < BufferSize) and (Buffer[i] in [$80..$8F]) and (CountOfTrailingBytes = 2) then
inc(FoundUTF8Strings)
else
Break;
end;
$C0, $C1, $F5..$FF: // invalid UTF-8 bytes
Break;
$80..$BF: // trailing bytes are consumed when handling leading bytes,
// any occurrence of "orphaned" trailing bytes is invalid UTF-8
Break;
end;
inc(i);
end;
end;
end;
end;
{**
Read a chunk out of a textfile unicode safe by passing a stream and its charset