diff --git a/source/helpers.pas b/source/helpers.pas index 463925c6..fb36ec70 100644 --- a/source/helpers.pas +++ b/source/helpers.pas @@ -96,6 +96,7 @@ type procedure SetWindowSizeGrip(hWnd: HWND; Enable: boolean); procedure SaveUnicodeFile(Filename: String; Text: WideString); procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset); + function GetFileCharset(Stream: TFileStream): TFileCharset; function ReadTextfileChunk(Stream: TFileStream; FileCharset: TFileCharset; ChunkSize: Int64 = 0): WideString; function ReadTextfile(Filename: String): WideString; @@ -2382,18 +2383,50 @@ end; Open a textfile unicode safe and return a stream + its charset } procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset); +begin + Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyWrite); + Stream.Position := 0; + FileCharset := GetFileCharset(Stream); +end; + + +{** + Detect a file's character set which can be + UTF-16 BE with BOM + UTF-16 LE with BOM + UTF-8 with or without BOM + ANSI + @see http://en.wikipedia.org/wiki/Byte_Order_Mark +} +function GetFileCharset(Stream: TFileStream): TFileCharset; var ByteOrderMark: WideChar; BytesRead: Integer; - Utf8Test: array[0..2] of AnsiChar; + Utf8Test: AnsiString; //array[0..2] of AnsiChar; + Buffer: array of Byte; + BufferSize, i, FoundUTF8Strings: Integer; const UNICODE_BOM = WideChar($FEFF); UNICODE_BOM_SWAPPED = WideChar($FFFE); UTF8_BOM = AnsiString(#$EF#$BB#$BF); -begin - Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyWrite); - Stream.Position := 0; + MinimumCountOfUTF8Strings = 1; + MaxBufferSize = $4000; + // 3 trailing bytes are the maximum in valid UTF-8 streams, + // so a count of 4 trailing bytes is enough to detect invalid UTF-8 streams + function CountOfTrailingBytes: Integer; + begin + Result := 0; + inc(i); + while (i < BufferSize) and (Result < 4) do begin + if Buffer[i] in [$80..$BF] then + 
inc(Result) + else + Break; + inc(i); + end; + end; +begin // Byte Order Mark ByteOrderMark := #0; if (Stream.Size - Stream.Position) >= SizeOf(ByteOrderMark) then begin @@ -2410,15 +2443,104 @@ begin end; // Test Byte Order Mark if ByteOrderMark = UNICODE_BOM then - FileCharset := fcsUnicode + Result := fcsUnicode else if ByteOrderMark = UNICODE_BOM_SWAPPED then - FileCharset := fcsUnicodeSwapped + Result := fcsUnicodeSwapped else if Utf8Test = UTF8_BOM then - FileCharset := fcsUtf8 - else - FileCharset := fcsAnsi; -end; + Result := fcsUtf8 + else begin + { @note Taken from SynUnicode.pas } + { If no BOM was found, check for leading/trailing byte sequences, + which are uncommon in usual non UTF-8 encoded text. + NOTE: There is no 100% safe way to detect UTF-8 streams. The bigger + MinimumCountOfUTF8Strings, the lower is the probability of + a false positive. On the other hand, a big MinimumCountOfUTF8Strings + makes it unlikely to detect files with only little usage of non + US-ASCII chars, like usual in European languages. NOTE(review): Result becomes fcsUtf8 only at the top of the loop pass that follows a counted sequence, so a valid sequence ending exactly at the end of the buffer appears to leave Result at fcsAnsi - TODO confirm. NOTE(review): when CountOfTrailingBytes stops on a non-trailing byte, the closing inc(i) of the loop appears to step past that byte without examining it - TODO confirm. 
} + + // if no special characteristics are found it is not UTF-8 + Result := fcsAnsi; + + // if Stream is nil, let Delphi raise the exception, by accessing Stream, + // to signal an invalid result + + // start analysis at the current Stream.Position + BufferSize := Min(MaxBufferSize, Stream.Size - Stream.Position); + + if BufferSize > 0 then begin + SetLength(Buffer, BufferSize); + Stream.ReadBuffer(Buffer[0], BufferSize); + Stream.Seek(-BufferSize, soFromCurrent); + + FoundUTF8Strings := 0; + i := 0; + while i < BufferSize do begin + if FoundUTF8Strings = MinimumCountOfUTF8Strings then begin + Result := fcsUtf8; + Break; + end; + case Buffer[i] of + $00..$7F: // skip US-ASCII characters as they could belong to various charsets + ; + $C2..$DF: + if CountOfTrailingBytes = 1 then + inc(FoundUTF8Strings) + else + Break; + $E0: + begin + inc(i); + if (i < BufferSize) and (Buffer[i] in [$A0..$BF]) and (CountOfTrailingBytes = 1) then + inc(FoundUTF8Strings) + else + Break; + end; + $E1..$EC, $EE..$EF: + if CountOfTrailingBytes = 2 then + inc(FoundUTF8Strings) + else + Break; + $ED: + begin + inc(i); + if (i < BufferSize) and (Buffer[i] in [$80..$9F]) and (CountOfTrailingBytes = 1) then + inc(FoundUTF8Strings) + else + Break; + end; + $F0: + begin + inc(i); + if (i < BufferSize) and (Buffer[i] in [$90..$BF]) and (CountOfTrailingBytes = 2) then + inc(FoundUTF8Strings) + else + Break; + end; + $F1..$F3: + if CountOfTrailingBytes = 3 then + inc(FoundUTF8Strings) + else + Break; + $F4: + begin + inc(i); + if (i < BufferSize) and (Buffer[i] in [$80..$8F]) and (CountOfTrailingBytes = 2) then + inc(FoundUTF8Strings) + else + Break; + end; + $C0, $C1, $F5..$FF: // invalid UTF-8 bytes + Break; + $80..$BF: // trailing bytes are consumed when handling leading bytes, + // any occurrence of "orphaned" trailing bytes is invalid UTF-8 + Break; + end; + inc(i); + end; + end; + end; +end; {** Read a chunk out of a textfile unicode safe by passing a stream and its charset