Fix bug #574 "UTF-8 textfiles without BOM are read as ANSI files". Split character set detection out into GetFileCharset() and leave the TFileStream creation in OpenTextFile().

2025-08-06 18:24:26 +08:00 · 2008-06-22 17:45:11 +00:00
parent fcdefa1f7a
commit 3a1d54da86
1 changed files with 132 additions and 10 deletions
--- a/source/helpers.pas
+++ b/source/helpers.pas
@ -96,6 +96,7 @@ type
  procedure SetWindowSizeGrip(hWnd: HWND; Enable: boolean);
  procedure SaveUnicodeFile(Filename: String; Text: WideString);
  procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
+  function GetFileCharset(Stream: TFileStream): TFileCharset;
  function ReadTextfileChunk(Stream: TFileStream; FileCharset: TFileCharset; ChunkSize: Int64 = 0): WideString;
  function ReadTextfile(Filename: String): WideString;

@ -2382,18 +2383,50 @@ end;
  Open a textfile unicode safe and return a stream + its charset
 }
 procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
+begin
+  // On success the stream is handed to the caller through the out parameter.
+  Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyWrite);
+  try
+    Stream.Position := 0;
+    FileCharset := GetFileCharset(Stream);
+  except
+    // Charset detection failed (e.g. a read error). The caller never gets a
+    // usable stream in that case, so release it here instead of leaking the
+    // file handle together with its fmShareDenyWrite lock, then re-raise.
+    Stream.Free;
+    Stream := nil;
+    raise;
+  end;
+end;
+
+
+{**
+  Detect a file's character set which can be
+  UTF-16 BE with BOM
+  UTF-16 LE with BOM
+  UTF-8 with or without BOM
+  ANSI
+  If no BOM is found, up to MaxBufferSize bytes starting at the current
+  stream position are scanned for well-formed UTF-8 multi-byte sequences.
+  The stream is moved back to where that scan started afterwards.
+  @param Stream Opened file stream to inspect
+  @return fcsUnicode, fcsUnicodeSwapped, fcsUtf8 or fcsAnsi
+  @see http://en.wikipedia.org/wiki/Byte_Order_Mark
+}
+function GetFileCharset(Stream: TFileStream): TFileCharset;
 var
  ByteOrderMark: WideChar;
  BytesRead: Integer;
-  Utf8Test: array[0..2] of AnsiChar;
+  Utf8Test: AnsiString;
+  Buffer: array of Byte;
+  BufferSize, i, FoundUTF8Strings: Integer;
 const
  UNICODE_BOM = WideChar($FEFF);
  UNICODE_BOM_SWAPPED = WideChar($FFFE);
  UTF8_BOM = AnsiString(#$EF#$BB#$BF);
-begin
-  Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyWrite);
-  Stream.Position := 0;
+  MinimumCountOfUTF8Strings = 1;
+  MaxBufferSize = $4000;

+  // Counts the trailing bytes ($80..$BF) which directly follow Buffer[i].
+  // 3 trailing bytes are the maximum in valid UTF-8 streams,
+  // so a count of 4 trailing bytes is enough to detect invalid UTF-8 streams.
+  // On return i rests on the last byte that was consumed, so the caller's
+  // own inc(i) advances to the first byte which has not been examined yet.
+  function CountOfTrailingBytes: Integer;
+  begin
+    Result := 0;
+    inc(i);
+    while (i < BufferSize) and (Result < 4) do begin
+      if Buffer[i] in [$80..$BF] then
+        inc(Result)
+      else
+        Break;
+      inc(i);
+    end;
+    // The loop stops one position behind the last consumed byte. Step back,
+    // otherwise the inc(i) of the main loop would skip the byte following
+    // each multi-byte sequence without ever checking it.
+    dec(i);
+  end;
+begin
  // Byte Order Mark
  ByteOrderMark := #0;
  if (Stream.Size - Stream.Position) >= SizeOf(ByteOrderMark) then begin
@ -2410,15 +2443,104 @@ begin
  end;
  // Test Byte Order Mark
  if ByteOrderMark = UNICODE_BOM then
-    FileCharset := fcsUnicode
+    Result := fcsUnicode
  else if ByteOrderMark = UNICODE_BOM_SWAPPED then
-    FileCharset := fcsUnicodeSwapped
+    Result := fcsUnicodeSwapped
  else if Utf8Test = UTF8_BOM then
-    FileCharset := fcsUtf8
-  else
-    FileCharset := fcsAnsi;
-end;
+    Result := fcsUtf8
+  else begin
+    { @note Taken from SynUnicode.pas }
+    { If no BOM was found, check for leading/trailing byte sequences,
+      which are uncommon in usual non UTF-8 encoded text.

+      NOTE: There is no 100% safe way to detect UTF-8 streams. The bigger
+            MinimumCountOfUTF8Strings, the lower is the probability of
+            a false positive. On the other hand, a big MinimumCountOfUTF8Strings
+            makes it unlikely to detect files with only little usage of non
+            US-ASCII chars, like usual in European languages. }
+
+    // if no special characteristics are found it is not UTF-8
+    Result := fcsAnsi;
+
+    // if Stream is nil, let Delphi raise the exception, by accessing Stream,
+    // to signal an invalid result
+
+    // start analysis at actual Stream.Position
+    BufferSize := Min(MaxBufferSize, Stream.Size - Stream.Position);
+
+    if BufferSize > 0 then begin
+      SetLength(Buffer, BufferSize);
+      Stream.ReadBuffer(Buffer[0], BufferSize);
+      // move back so that the analysed bytes can be read again afterwards
+      Stream.Seek(-BufferSize, soFromCurrent);
+
+      FoundUTF8Strings := 0;
+      i := 0;
+      while i < BufferSize do begin
+        if FoundUTF8Strings >= MinimumCountOfUTF8Strings then begin
+          Result := fcsUtf8;
+          Break;
+        end;
+        case Buffer[i] of
+          $00..$7F: // skip US-ASCII characters as they could belong to various charsets
+            ;
+          $C2..$DF:
+            if CountOfTrailingBytes = 1 then
+              inc(FoundUTF8Strings)
+            else
+              Break;
+          $E0:
+            begin
+              inc(i);
+              if (i < BufferSize) and (Buffer[i] in [$A0..$BF]) and (CountOfTrailingBytes = 1) then
+                inc(FoundUTF8Strings)
+              else
+                Break;
+            end;
+          $E1..$EC, $EE..$EF:
+            if CountOfTrailingBytes = 2 then
+              inc(FoundUTF8Strings)
+            else
+              Break;
+          $ED:
+            begin
+              inc(i);
+              if (i < BufferSize) and (Buffer[i] in [$80..$9F]) and (CountOfTrailingBytes = 1) then
+                inc(FoundUTF8Strings)
+              else
+                Break;
+            end;
+          $F0:
+            begin
+              inc(i);
+              if (i < BufferSize) and (Buffer[i] in [$90..$BF]) and (CountOfTrailingBytes = 2) then
+                inc(FoundUTF8Strings)
+              else
+                Break;
+            end;
+          $F1..$F3:
+            if CountOfTrailingBytes = 3 then
+              inc(FoundUTF8Strings)
+            else
+              Break;
+          $F4:
+            begin
+              inc(i);
+              if (i < BufferSize) and (Buffer[i] in [$80..$8F]) and (CountOfTrailingBytes = 2) then
+                inc(FoundUTF8Strings)
+              else
+                Break;
+            end;
+          $C0, $C1, $F5..$FF: // invalid UTF-8 bytes
+            Break;
+          $80..$BF: // trailing bytes are consumed when handling leading bytes,
+                     // any occurrence of "orphaned" trailing bytes is invalid UTF-8
+            Break;
+        end;
+        inc(i);
+      end;
+      // The check at the top of the loop is never reached again when the
+      // deciding sequence is the last thing in the buffer (e.g. a file that
+      // ends with its only non US-ASCII character), so evaluate it once more.
+      // After a Break on invalid input the count is still below the minimum.
+      if FoundUTF8Strings >= MinimumCountOfUTF8Strings then
+        Result := fcsUtf8;
+    end;
+  end;
+end;

 {**
  Read a chunk out of a textfile unicode safe by passing a stream and its charset