Fix bug #574 "UTF-8 text files without BOM are read as ANSI files". Split the character set detection out into GetFileCharset() and leave only the TFileStream creation in OpenTextFile().

2025-08-06 18:24:26 +08:00 · 2008-06-22 17:45:11 +00:00
parent fcdefa1f7a
commit 3a1d54da86
1 changed files with 132 additions and 10 deletions
--- a/source/helpers.pas
+++ b/source/helpers.pas
@ -96,6 +96,7 @@ type
  procedure SetWindowSizeGrip(hWnd: HWND; Enable: boolean);
  procedure SaveUnicodeFile(Filename: String; Text: WideString);
  procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
  function GetFileCharset(Stream: TFileStream): TFileCharset;
  function ReadTextfileChunk(Stream: TFileStream; FileCharset: TFileCharset; ChunkSize: Int64 = 0): WideString;
  function ReadTextfile(Filename: String): WideString;
@ -2382,18 +2383,50 @@ end;
  Open a text file in a Unicode-safe way and return a stream plus its detected charset
 }
 procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
 begin
  // Opens Filename read-only (other processes may still read it, but not
  // write) and detects its character set via GetFileCharset.
  //   Filename    - path of the text file to open
  //   Stream      - receives the open file stream on success; nil if opening
  //                 or charset detection raised (the caller is presumably
  //                 responsible for freeing it on success)
  //   FileCharset - receives the charset reported by GetFileCharset
  // Any exception from opening or reading the file is propagated unchanged.

  // Give the out-parameter a defined value in case Create raises
  Stream := nil;
  Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyWrite);
  try
    // Charset detection has to start at the very beginning of the file
    Stream.Position := 0;
    FileCharset := GetFileCharset(Stream);
  except
    // GetFileCharset reads from the stream and may raise. Without this
    // handler the freshly opened file handle would leak, because a caller's
    // try..finally around the stream is never entered when this call fails.
    Stream.Free;
    Stream := nil;
    raise;
  end;
 end;
 {**
  Detect a file's character set which can be
  UTF-16 BE with BOM
  UTF-16 LE with BOM
  UTF-8 with or without BOM
  ANSI
  @see http://en.wikipedia.org/wiki/Byte_Order_Mark
 }
 function GetFileCharset(Stream: TFileStream): TFileCharset;
 var
  ByteOrderMark: WideChar;
  BytesRead: Integer;
-  Utf8Test: array[0..2] of AnsiChar;
+  Utf8Test: AnsiString; //array[0..2] of AnsiChar;
  Buffer: array of Byte;
  BufferSize, i, FoundUTF8Strings: Integer;
 const
  UNICODE_BOM = WideChar($FEFF);
  UNICODE_BOM_SWAPPED = WideChar($FFFE);
  UTF8_BOM = AnsiString(#$EF#$BB#$BF);
-begin
+  MinimumCountOfUTF8Strings = 1;
-  Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyWrite);
+  MaxBufferSize = $4000;
  Stream.Position := 0;
  // 3 trailing bytes are the maximum in valid UTF-8 streams,
  // so a count of 4 trailing bytes is enough to detect invalid UTF-8 streams
  function CountOfTrailingBytes: Integer;
  begin
    // Steps past the lead byte at Buffer[i], then counts the run of UTF-8
    // continuation bytes ($80..$BF) that follows. Counting stops at the
    // first non-continuation byte, at the end of the buffer, or once four
    // continuation bytes were seen. On return, i is the index of the first
    // byte that was not counted (or BufferSize).
    // Relies on short-circuit boolean evaluation so that Buffer[i] is never
    // read with i = BufferSize.
    Result := 0;
    inc(i);
    while (i < BufferSize) and (Result < 4) and (Buffer[i] in [$80..$BF]) do begin
      inc(Result);
      inc(i);
    end;
  end;
 begin
  // Byte Order Mark
  ByteOrderMark := #0;
  if (Stream.Size - Stream.Position) >= SizeOf(ByteOrderMark) then begin
@ -2410,15 +2443,104 @@ begin
  end;
  // Test Byte Order Mark
  if ByteOrderMark = UNICODE_BOM then
-    FileCharset := fcsUnicode
+    Result := fcsUnicode
  else if ByteOrderMark = UNICODE_BOM_SWAPPED then
-    FileCharset := fcsUnicodeSwapped
+    Result := fcsUnicodeSwapped
  else if Utf8Test = UTF8_BOM then
-    FileCharset := fcsUtf8
+    Result := fcsUtf8
-  else
+  else begin
-    FileCharset := fcsAnsi;
+    { @note Taken from SynUnicode.pas }
-end;
+    { If no BOM was found, check for leading/trailing byte sequences,
      which are uncommon in usual non UTF-8 encoded text.
      NOTE: There is no 100% safe way to detect UTF-8 streams. The bigger
            MinimumCountOfUTF8Strings, the lower is the probability of
            a false positive. On the other hand, a big MinimumCountOfUTF8Strings
            makes it unlikely to detect files with only little usage of non
            US-ASCII chars, like usual in European languages. }
    // if no special characteristics are found it is not UTF-8
    Result := fcsAnsi;
    // if Stream is nil, let Delphi raise the exception, by accessing Stream,
    // to signal an invalid result
    // start analysis at actual Stream.Position
    BufferSize := Min(MaxBufferSize, Stream.Size - Stream.Position);
    if BufferSize > 0 then begin
      SetLength(Buffer, BufferSize);
      Stream.ReadBuffer(Buffer[0], BufferSize);
      Stream.Seek(-BufferSize, soFromCurrent);
      FoundUTF8Strings := 0;
      i := 0;
      while i < BufferSize do begin
        if FoundUTF8Strings = MinimumCountOfUTF8Strings then begin
          Result := fcsUtf8;
          Break;
        end;
        case Buffer[i] of
          $00..$7F: // skip US-ASCII characters as they could belong to various charsets
            ;
          $C2..$DF:
            if CountOfTrailingBytes = 1 then
              inc(FoundUTF8Strings)
            else
              Break;
          $E0:
            begin
              inc(i);
              if (i < BufferSize) and (Buffer[i] in [$A0..$BF]) and (CountOfTrailingBytes = 1) then
                inc(FoundUTF8Strings)
              else
                Break;
            end;
          $E1..$EC, $EE..$EF:
            if CountOfTrailingBytes = 2 then
              inc(FoundUTF8Strings)
            else
              Break;
          $ED:
            begin
              inc(i);
              if (i < BufferSize) and (Buffer[i] in [$80..$9F]) and (CountOfTrailingBytes = 1) then
                inc(FoundUTF8Strings)
              else
                Break;
            end;
          $F0:
            begin
              inc(i);
              if (i < BufferSize) and (Buffer[i] in [$90..$BF]) and (CountOfTrailingBytes = 2) then
                inc(FoundUTF8Strings)
              else
                Break;
            end;
          $F1..$F3:
            if CountOfTrailingBytes = 3 then
              inc(FoundUTF8Strings)
            else
              Break;
          $F4:
            begin
              inc(i);
              if (i < BufferSize) and (Buffer[i] in [$80..$8F]) and (CountOfTrailingBytes = 2) then
                inc(FoundUTF8Strings)
              else
                Break;
            end;
          $C0, $C1, $F5..$FF: // invalid UTF-8 bytes
            Break;
          $80..$BF: // trailing bytes are consumed when handling leading bytes,
                     // any occurrence of "orphaned" trailing bytes is invalid UTF-8
            Break;
        end;
        inc(i);
      end;
    end;
  end;
 end;
 {**
  Read a chunk out of a textfile unicode safe by passing a stream and its charset