From a38c70b99ef29857a16903eb5d956dfc9307131f Mon Sep 17 00:00:00 2001 From: Ansgar Becker Date: Wed, 4 Sep 2013 07:53:44 +0000 Subject: [PATCH] * Try a new approach in helpers.OpenTextFile(), helpers.ReadTextfile() and helpers.ReadTextfileChunk(): Based on TStreamReader instead of TFileStream now, so we can finally rely on Delphi internals for detecting a file's encoding. Also, this should fix read errors in some UTF-8 files, e.g. mentioned on http://www.heidisql.com/forum.php?t=13044 * Remove helpers.DetectEncoding(). Use a separate TStreamReader in the only caller to detect the encoding of a selected file * Remove helpers.ScanNulChar() * Remove helpers.RemoveNulChars() --- out/locale/en/LC_MESSAGES/default.po | 16 -- source/const.inc | 2 - source/grideditlinks.pas | 6 +- source/helpers.pas | 274 +++------------------------ source/loaddata.pas | 8 +- source/main.pas | 20 +- source/texteditor.pas | 5 - 7 files changed, 37 insertions(+), 294 deletions(-) diff --git a/out/locale/en/LC_MESSAGES/default.po b/out/locale/en/LC_MESSAGES/default.po index b5a8bd00..3104773f 100644 --- a/out/locale/en/LC_MESSAGES/default.po +++ b/out/locale/en/LC_MESSAGES/default.po @@ -5403,22 +5403,6 @@ msgstr "Snippets" msgid "Index" msgstr "Index" -#. const.inc -msgid "" -"This file contains NUL characters. They have been converted to ASCII spaces " -"(SP)." -msgstr "" -"This file contains NUL characters. They have been converted to ASCII spaces " -"(SP)." - -#. const.inc -msgid "" -"This cell contains NUL characters. They have been converted to ASCII spaces " -"(SP). Press ESC to cancel editing." -msgstr "" -"This cell contains NUL characters. They have been converted to ASCII spaces " -"(SP). Press ESC to cancel editing." - #. const.inc msgid "Unhandled tree node index" msgstr "Unhandled tree node index" diff --git a/source/const.inc b/source/const.inc index ddefbf28..39d51cfd 100644 --- a/source/const.inc +++ b/source/const.inc @@ -81,8 +81,6 @@ const // Modification indicator for TControl.Tag MODIFIEDFLAG = 10; - SContainsNulCharFile = 'This file contains NUL characters. They have been converted to ASCII spaces (SP).'; - SContainsNulCharGrid = 'This cell contains NUL characters. They have been converted to ASCII spaces (SP). Press ESC to cancel editing.'; SUnhandledNodeIndex = 'Unhandled tree node index'; MSG_NOGRIDEDITING = 'Selected columns don''t contain a sufficient set of key columns to allow editing. Please select primary or unique key columns, or just all columns.'; SIdle = 'Idle.'; diff --git a/source/grideditlinks.pas b/source/grideditlinks.pas index 0b7e2327..144391a9 100644 --- a/source/grideditlinks.pas +++ b/source/grideditlinks.pas @@ -1135,11 +1135,7 @@ begin FEdit.Font.Assign(FCellFont); FEdit.Font.Color := clWindowText; FPanel.Color := FCellBackground; - if ScanNulChar(FCellText) then begin - MessageDialog(_(SContainsNulCharGrid), mtInformation, [mbOK]); - FEdit.Text := RemoveNulChars(FCellText); - end else - FEdit.Text := FCellText; + FEdit.Text := FCellText; FEdit.Modified := False; end; diff --git a/source/helpers.pas b/source/helpers.pas index 562d0b2c..c5622a2f 100644 --- a/source/helpers.pas +++ b/source/helpers.pas @@ -254,9 +254,7 @@ type function CleanupNumber(Str: String): String; function IsNumeric(Str: String): Boolean; function esc(Text: String; ProcessJokerChars: Boolean=false; DoQuote: Boolean=True): String; - function ScanNulChar(Text: String): Boolean; function ScanLineBreaks(Text: String): TLineBreaks; - function RemoveNulChars(Text: String): String; function fixNewlines(txt: String): String; function ExtractComment(var SQL: String): String; function GetShellFolder(CSIDL: integer): string; @@ -278,9 +276,8 @@ type function GetTempDir: String; procedure SetWindowSizeGrip(hWnd: HWND; Enable: boolean); procedure SaveUnicodeFile(Filename: String; Text: String); - procedure OpenTextFile(const Filename: String; out Stream: TFileStream; var Encoding: TEncoding); - function DetectEncoding(Stream: TStream): TEncoding; - function ReadTextfileChunk(Stream: TFileStream; Encoding: TEncoding; ChunkSize: Int64 = 0): String; + procedure OpenTextFile(const Filename: String; out Reader: TStreamReader; Encoding: TEncoding); + function ReadTextfileChunk(Reader: TStreamReader; ChunkSize: Integer=0): String; function ReadTextfile(Filename: String; Encoding: TEncoding): String; function ReadBinaryFile(Filename: String; MaxBytes: Int64): AnsiString; procedure StreamToClipboard(Text, HTML: TStream; CreateHTMLHeader: Boolean); @@ -641,27 +638,6 @@ begin end; -{*** - Detect NUL character in a text. - Useful because fx SynEdit cuts of all text after it encounters a NUL. -} -function ScanNulChar(Text: String): boolean; -var - i: integer; -begin - result := false; - for i:=1 to length(Text) do - begin - if Text[i] = #0 then - begin - result := true; - exit; - end; - end; -end; - - - {*** SynEdit removes all newlines and semi-randomly decides a new newline format to use for any text edited. @@ -708,31 +684,6 @@ begin end; - -{*** - Mangle input text so that SynEdit can load it. - - @param string Text to test - @return Boolean -} -function RemoveNulChars(Text: String): String; -var - i: integer; - c: Char; -begin - SetLength(Result, Length(Text)); - if Length(Text) = 0 then Exit; - i := 1; - repeat - c := Text[i]; - if c = #0 then Result[i] := #32 - else Result[i] := c; - i := i + 1; - until i > length(Text); -end; - - - {*** Unify CR's and LF's to CRLF @@ -1253,220 +1204,45 @@ begin end; -{** - Open a textfile unicode safe and return a stream + its charset -} -procedure OpenTextFile(const Filename: String; out Stream: TFileStream; var Encoding: TEncoding); -var - Header: TBytes; - BomLen: Integer; +procedure OpenTextFile(const Filename: String; out Reader: TStreamReader; Encoding: TEncoding); begin - Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyNone); - if Encoding = nil then - Encoding := DetectEncoding(Stream); - // If the file contains a BOM, advance the stream's position - BomLen := 0; - if Length(Encoding.GetPreamble) > 0 then begin - SetLength(Header, Length(Encoding.GetPreamble)); - Stream.ReadBuffer(Pointer(Header)^, Length(Header)); - if CompareMem(Header, Encoding.GetPreamble, SizeOf(Header)) then - BomLen := Length(Encoding.GetPreamble); - end; - Stream.Position := BomLen; + // Open a textfile and return a StreamReader, which detects its encoding if not passed by the caller + if Encoding <> nil then + Reader := TStreamReader.Create(Filename, Encoding) + else + Reader := TStreamReader.Create(Filename, True); end; -{** - Detect stream's content encoding by examing first 100k bytes (MaxBufferSize). Result can be: - UTF-16 BE with BOM - UTF-16 LE with BOM - UTF-8 with or without BOM - ANSI - Aimed to work better than WideStrUtils.IsUTF8String() which didn't work in any test case here. - @see http://en.wikipedia.org/wiki/Byte_Order_Mark - Could also do that with TEncoding.GetBufferEncoding, but that relies on the file having a BOM -} -function DetectEncoding(Stream: TStream): TEncoding; -var - ByteOrderMark: Char; - BytesRead: Integer; - Utf8Test: array[0..2] of AnsiChar; - Buffer: array of Byte; - BufferSize, i, FoundUTF8Strings: Integer; -const - UNICODE_BOM = Char($FEFF); - UNICODE_BOM_SWAPPED = Char($FFFE); - UTF8_BOM = AnsiString(#$EF#$BB#$BF); - MinimumCountOfUTF8Strings = 1; - MaxBufferSize = 100000; - - // 3 trailing bytes are the maximum in valid UTF-8 streams, - // so a count of 4 trailing bytes is enough to detect invalid UTF-8 streams - function CountOfTrailingBytes: Integer; - begin - Result := 0; - inc(i); - while (i < BufferSize) and (Result < 4) do begin - if Buffer[i] in [$80..$BF] then - inc(Result) - else - Break; - inc(i); - end; - end; - -begin - // Byte Order Mark - ByteOrderMark := #0; - if (Stream.Size - Stream.Position) >= SizeOf(ByteOrderMark) then begin - BytesRead := Stream.Read(ByteOrderMark, SizeOf(ByteOrderMark)); - if (ByteOrderMark <> UNICODE_BOM) and (ByteOrderMark <> UNICODE_BOM_SWAPPED) then begin - ByteOrderMark := #0; - Stream.Seek(-BytesRead, soFromCurrent); - if (Stream.Size - Stream.Position) >= Length(Utf8Test) * SizeOf(AnsiChar) then begin - BytesRead := Stream.Read(Utf8Test[0], Length(Utf8Test) * SizeOf(AnsiChar)); - if Utf8Test <> UTF8_BOM then - Stream.Seek(-BytesRead, soFromCurrent); - end; - end; - end; - // Test Byte Order Mark - if ByteOrderMark = UNICODE_BOM then - Result := TEncoding.Unicode - else if ByteOrderMark = UNICODE_BOM_SWAPPED then - Result := TEncoding.BigEndianUnicode - else if Utf8Test = UTF8_BOM then - Result := TEncoding.UTF8 - else begin - { @note Taken from SynUnicode.pas } - { If no BOM was found, check for leading/trailing byte sequences, - which are uncommon in usual non UTF-8 encoded text. - - NOTE: There is no 100% save way to detect UTF-8 streams. The bigger - MinimumCountOfUTF8Strings, the lower is the probability of - a false positive. On the other hand, a big MinimumCountOfUTF8Strings - makes it unlikely to detect files with only little usage of non - US-ASCII chars, like usual in European languages. } - - // if no special characteristics are found it is not UTF-8 - Result := TEncoding.Default; - - // start analysis at actual Stream.Position - BufferSize := Min(MaxBufferSize, Stream.Size - Stream.Position); - - if BufferSize > 0 then begin - SetLength(Buffer, BufferSize); - Stream.ReadBuffer(Buffer[0], BufferSize); - Stream.Seek(-BufferSize, soFromCurrent); - - FoundUTF8Strings := 0; - i := 0; - while i < BufferSize do begin - if FoundUTF8Strings = MinimumCountOfUTF8Strings then begin - Result := TEncoding.UTF8; - Break; - end; - case Buffer[i] of - $00..$7F: // skip US-ASCII characters as they could belong to various charsets - ; - $C2..$DF: - if CountOfTrailingBytes = 1 then - inc(FoundUTF8Strings) - else - Break; - $E0: - begin - inc(i); - if (i < BufferSize) and (Buffer[i] in [$A0..$BF]) and (CountOfTrailingBytes = 1) then - inc(FoundUTF8Strings) - else - Break; - end; - $E1..$EC, $EE..$EF: - if CountOfTrailingBytes = 2 then - inc(FoundUTF8Strings) - else - Break; - $ED: - begin - inc(i); - if (i < BufferSize) and (Buffer[i] in [$80..$9F]) and (CountOfTrailingBytes = 1) then - inc(FoundUTF8Strings) - else - Break; - end; - $F0: - begin - inc(i); - if (i < BufferSize) and (Buffer[i] in [$90..$BF]) and (CountOfTrailingBytes = 2) then - inc(FoundUTF8Strings) - else - Break; - end; - $F1..$F3: - if CountOfTrailingBytes = 3 then - inc(FoundUTF8Strings) - else - Break; - $F4: - begin - inc(i); - if (i < BufferSize) and (Buffer[i] in [$80..$8F]) and (CountOfTrailingBytes = 2) then - inc(FoundUTF8Strings) - else - Break; - end; - $C0, $C1, $F5..$FF: // invalid UTF-8 bytes - Break; - $80..$BF: // trailing bytes are consumed when handling leading bytes, - // any occurence of "orphaned" trailing bytes is invalid UTF-8 - Break; - end; - inc(i); - end; - end; - end; -end; - - -{** - Read a chunk out of a textfile unicode safe by passing a stream and its charset -} -function ReadTextfileChunk(Stream: TFileStream; Encoding: TEncoding; ChunkSize: Int64 = 0): String; +function ReadTextfileChunk(Reader: TStreamReader; ChunkSize: Integer = 0): String; var + Buffer: TCharArray; DataLeft: Int64; - LBuffer: TBytes; - SplitCharSize: Integer; begin - // Be sure to read a multiplier of the encodings max byte count per char - SplitCharSize := ChunkSize mod Encoding.GetMaxByteCount(1); - if SplitCharSize > 0 then - Inc(ChunkSize, Encoding.GetMaxByteCount(1)-SplitCharSize); - DataLeft := Stream.Size - Stream.Position; + // Read a chunk or the complete contents out of a textfile, opened by OpenTextFile() + DataLeft := Reader.BaseStream.Size - Reader.BaseStream.Position; if (ChunkSize = 0) or (ChunkSize > DataLeft) then ChunkSize := DataLeft; - SetLength(LBuffer, ChunkSize); - Stream.ReadBuffer(Pointer(LBuffer)^, ChunkSize); - // Now, TEncoding.Convert returns an empty TByte array in files with russion characters - // See http://www.heidisql.com/forum.php?t=13044 - LBuffer := Encoding.Convert(Encoding, TEncoding.Unicode, LBuffer); - if Length(LBuffer) = 0 then - MainForm.LogSQL('Error when converting chunk from encoding '+Encoding.EncodingName+' to '+TEncoding.Unicode.EncodingName+' in '+ExtractFileName(Stream.FileName)+' at position '+FormatByteNumber(Stream.Position)); - Result := TEncoding.Unicode.GetString(LBuffer); + SetLength(Buffer, ChunkSize); + Reader.ReadBlock(Buffer, 0, Length(Buffer)); + if Length(Buffer) > 0 then + SetString(Result, PChar(@Buffer[0]), Length(Buffer)) + else + Result := ''; end; -{** - Read a unicode or ansi file into memory -} + function ReadTextfile(Filename: String; Encoding: TEncoding): String; var - Stream: TFileStream; + Reader: TStreamReader; begin - OpenTextfile(Filename, Stream, Encoding); - Result := ReadTextfileChunk(Stream, Encoding); - Stream.Free; + // Read a text file into memory + OpenTextfile(Filename, Reader, Encoding); + Result := ReadTextfileChunk(Reader); + Reader.Free; end; + function ReadBinaryFile(Filename: String; MaxBytes: Int64): AnsiString; var Stream: TFileStream; diff --git a/source/loaddata.pas b/source/loaddata.pas index 61f2137a..1005e198 100644 --- a/source/loaddata.pas +++ b/source/loaddata.pas @@ -582,7 +582,7 @@ end; procedure Tloaddataform.btnOpenFileClick(Sender: TObject); var Dialog: TOpenTextFileDialog; - TestStream: TFileStream; + TestReader: TStreamReader; begin Dialog := TOpenTextFileDialog.Create(Self); Dialog.Filter := _('CSV files')+' (*.csv)|*.csv|'+_('Text files')+' (*.txt)|*.txt|'+_('All files')+' (*.*)|*.*'; @@ -593,9 +593,9 @@ begin editfilename.Text := Dialog.FileName; Encoding := Mainform.GetEncodingByName(Dialog.Encodings[Dialog.EncodingIndex]); if Encoding = nil then begin - TestStream := TFileStream.Create(Dialog.Filename, fmOpenRead or fmShareDenyNone); - Encoding := DetectEncoding(TestStream); - TestStream.Free; + TestReader := TStreamReader.Create(Dialog.Filename, True); + Encoding := TestReader.CurrentEncoding; + TestReader.Free; end; SelectedCharsetIndex := -1; grpParseMethod.OnClick(Sender); diff --git a/source/main.pas b/source/main.pas index 667c65d9..4aabd978 100644 --- a/source/main.pas +++ b/source/main.pas @@ -3119,7 +3119,7 @@ procedure TMainForm.RunQueryFile(FileName: String; Encoding: TEncoding); var Dialog: IProgressDialog; Dummy: Pointer; - Stream: TFileStream; + Reader: TStreamReader; Lines, LinesRemain, ErrorNotice: String; Filesize, QueryCount, ErrorCount, RowsAffected, Position: Int64; Queries: TSQLBatch; @@ -3129,7 +3129,7 @@ var begin Dialog.SetLine(1, PChar(_('Clean up ...')), False, Dummy); Queries.Free; - Stream.Free; + Reader.Free; Dialog.StopProgressDialog; BringToFront; SetFocus; @@ -3155,14 +3155,14 @@ begin // Start file operations Filesize := _GetFileSize(FileName); - OpenTextfile(FileName, Stream, Encoding); - while Stream.Position < Stream.Size do begin + OpenTextfile(FileName, Reader, Encoding); + while not Reader.EndOfStream do begin if Dialog.HasUserCancelled then Break; // Read lines from SQL file until buffer reaches a limit of some MB // This strategy performs vastly better than looping through each line - Lines := ReadTextfileChunk(Stream, Encoding, 20*SIZE_MB); + Lines := ReadTextfileChunk(Reader, 20*SIZE_MB); // Split buffer into single queries Queries.SQL := LinesRemain + Lines; @@ -3174,12 +3174,12 @@ begin if Dialog.HasUserCancelled then Break; // Last line has to be processed in next loop if end of file is not reached - if (i = Queries.Count-1) and (Stream.Position < Stream.Size) then begin + if (i = Queries.Count-1) and (not Reader.EndOfStream) then begin LinesRemain := Queries[i].SQL; Break; end; Inc(QueryCount); - Position := Position + Encoding.GetByteCount(Queries[i].SQL); + Position := Position + Reader.CurrentEncoding.GetByteCount(Queries[i].SQL); if ErrorCount > 0 then ErrorNotice := '(' + FormatNumber(ErrorCount) + ' ' + _('Errors') + ')'; Dialog.SetLine(1, @@ -11341,12 +11341,6 @@ begin if Pos(DirnameSnippets, Filename) = 0 then MainForm.AddOrRemoveFromQueryLoadHistory(Filename, True, True); Memo.UndoList.AddGroupBreak; - - if ScanNulChar(Content) then begin - Content := RemoveNulChars(Content); - MessageDialog(_(SContainsNulCharFile), mtInformation, [mbOK]); - end; - Memo.BeginUpdate; LineBreaks := ScanLineBreaks(Content); if ReplaceContent then begin diff --git a/source/texteditor.pas b/source/texteditor.pas index c2e77d03..0ced4a94 100644 --- a/source/texteditor.pas +++ b/source/texteditor.pas @@ -111,11 +111,6 @@ begin if LB <> '' then text := StringReplace(text, LB, CRLF, [rfReplaceAll]); - if ScanNulChar(text) then begin - MessageDialog(_(SContainsNulCharGrid), mtInformation, [mbOK]); - text := RemoveNulChars(text); - end; - // TODO: Find out why the Delphi IDE insists hinting that this // property is ANSI when it is in fact a WideString. memoText.Text := text;