Rename ReadUnicodeFile to ReadTextfile and split it into 3 different procedures:

- OpenTextfile
- ReadTextfileChunk
- ReadTextfile
This enables reuse of the unicode detection in places where a file has to be read chunk by chunk, e.g. the runsqlfile unit (see the usage sketch below).
TODO: Fix UTF-8 detection; such files are currently misdetected as ANSI.
Ansgar Becker
2008-06-19 19:08:54 +00:00
parent 4cd5743980
commit 58230fea56
2 changed files with 84 additions and 67 deletions
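
How the new API is meant to be consumed, as a rough sketch (not part of this commit): it assumes the helpers unit changed below is in the uses clause, and the ProcessBigTextfile name and the 5 MB chunk size are purely illustrative stand-ins for whatever a caller such as the runsqlfile unit would actually use.

// Sketch only: read a large file chunk by chunk with the new routines.
procedure ProcessBigTextfile(const Filename: String);
const
  CHUNK_SIZE = 5 * 1024 * 1024; // bytes per ReadTextfileChunk call, arbitrary
var
  Stream: TFileStream;
  FileCharset: TFileCharset;
  Chunk: WideString;
begin
  // Detect the charset once, then keep pulling decoded chunks from the stream.
  OpenTextFile(Filename, Stream, FileCharset);
  try
    while Stream.Position < Stream.Size do begin
      Chunk := ReadTextfileChunk(Stream, FileCharset, CHUNK_SIZE);
      // Hand Chunk to whatever consumes it, e.g. an SQL splitter in runsqlfile.
      // (Chunk boundaries are plain byte offsets, so the consumer must cope
      // with statements or multi-byte sequences spanning two chunks.)
    end;
  finally
    Stream.Free;
  end;
end;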


@@ -27,6 +27,8 @@ type
  TVTreeDataArray = Array of TVTreeData;
  PVTreeDataArray = ^TVTreeDataArray;

  TFileCharset = (fcsAnsi, fcsUnicode, fcsUnicodeSwapped, fcsUtf8);

{$I const.inc}

  function trimc(s: String; c: Char) : String;
@@ -93,7 +95,9 @@ type
  function GetDBObjectType( TableStatus: TFields ): Byte;
  procedure SetWindowSizeGrip(hWnd: HWND; Enable: boolean);
  procedure SaveUnicodeFile(Filename: String; Text: WideString);
  function ReadUnicodeFile(Filename: String): WideString;
  procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
  function ReadTextfileChunk(Stream: TFileStream; FileCharset: TFileCharset; ChunkSize: Cardinal = 0): WideString;
  function ReadTextfile(Filename: String): WideString;

var
  MYSQL_KEYWORDS : TStringList;
@@ -2375,86 +2379,99 @@ end;
{**
  Read a unicode or ansi file into memory
  Open a textfile unicode safe and return a stream + its charset
}
function ReadUnicodeFile(Filename: String): WideString;
procedure OpenTextFile(const Filename: String; out Stream: TFileStream; out FileCharset: TFileCharset);
var
  Stream: TFileStream;
  ByteOrderMark: WideChar;
  BytesRead: Integer;
  Utf8Test: array[0..2] of AnsiChar;
  DataLeft: Integer;
  StreamCharSet: Byte;
  SA: AnsiString;
  P: PWord;
const
  UNICODE_BOM = WideChar($FEFF);
  UNICODE_BOM_SWAPPED = WideChar($FFFE);
  UTF8_BOM = AnsiString(#$EF#$BB#$BF);
  sscAnsi = 0;
  sscUnicode = 1;
  sscUnicodeSwapped = 2;
  sscUtf8 = 3;
begin
  Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyWrite);
  Stream.Position := 0;
  // Byte Order Mark
  ByteOrderMark := #0;
  if (Stream.Size - Stream.Position) >= SizeOf(ByteOrderMark) then begin
    BytesRead := Stream.Read(ByteOrderMark, SizeOf(ByteOrderMark));
    if (ByteOrderMark <> UNICODE_BOM) and (ByteOrderMark <> UNICODE_BOM_SWAPPED) then begin
      ByteOrderMark := #0;
      Stream.Seek(-BytesRead, soFromCurrent);
      if (Stream.Size - Stream.Position) >= Length(Utf8Test) * SizeOf(AnsiChar) then begin
        BytesRead := Stream.Read(Utf8Test[0], Length(Utf8Test) * SizeOf(AnsiChar));
        if Utf8Test <> UTF8_BOM then
          Stream.Seek(-BytesRead, soFromCurrent);
      end;
    end;
  end;
  // Test Byte Order Mark
  if ByteOrderMark = UNICODE_BOM then
    FileCharset := fcsUnicode
  else if ByteOrderMark = UNICODE_BOM_SWAPPED then
    FileCharset := fcsUnicodeSwapped
  else if Utf8Test = UTF8_BOM then
    FileCharset := fcsUtf8
  else
    FileCharset := fcsAnsi;
end;

{**
  Read a chunk out of a textfile unicode safe by passing a stream and its charset
}
function ReadTextfileChunk(Stream: TFileStream; FileCharset: TFileCharset; ChunkSize: Cardinal = 0): WideString;
var
  SA: AnsiString;
  P: PWord;
begin
  if ChunkSize = 0 then
    ChunkSize := Stream.Size - Stream.Position;
  if (FileCharset in [fcsUnicode, fcsUnicodeSwapped]) then begin
    // BOM indicates Unicode text stream
    if ChunkSize < SizeOf(WideChar) then
      Result := ''
    else begin
      SetLength(Result, ChunkSize div SizeOf(WideChar));
      Stream.Read(PWideChar(Result)^, ChunkSize);
      if FileCharset = fcsUnicodeSwapped then begin
        P := PWord(PWideChar(Result));
        While (P^ <> 0) do begin
          P^ := MakeWord(HiByte(P^), LoByte(P^));
          Inc(P);
        end;
      end;
    end;
  end else if FileCharset = fcsUtf8 then begin
    // BOM indicates UTF-8 text stream
    SetLength(SA, ChunkSize div SizeOf(AnsiChar));
    Stream.Read(PAnsiChar(SA)^, ChunkSize);
    Result := UTF8Decode(SA);
  end else begin
    // without byte order mark it is assumed that we are loading ANSI text
    SetLength(SA, ChunkSize div SizeOf(AnsiChar));
    Stream.Read(PAnsiChar(SA)^, ChunkSize);
    Result := SA;
  end;
end;

{**
  Read a unicode or ansi file into memory
}
function ReadTextfile(Filename: String): WideString;
var
  Stream: TFileStream;
  FileCharset: TFileCharset;
begin
  try
    Stream.Position := 0;
    // Byte Order Mark
    ByteOrderMark := #0;
    if (Stream.Size - Stream.Position) >= SizeOf(ByteOrderMark) then begin
      BytesRead := Stream.Read(ByteOrderMark, SizeOf(ByteOrderMark));
      if (ByteOrderMark <> UNICODE_BOM) and (ByteOrderMark <> UNICODE_BOM_SWAPPED) then begin
        ByteOrderMark := #0;
        Stream.Seek(-BytesRead, soFromCurrent);
        if (Stream.Size - Stream.Position) >= Length(Utf8Test) * SizeOf(AnsiChar) then begin
          BytesRead := Stream.Read(Utf8Test[0], Length(Utf8Test) * SizeOf(AnsiChar));
          if Utf8Test <> UTF8_BOM then
            Stream.Seek(-BytesRead, soFromCurrent);
        end;
      end;
    end;
    // Test Byte Order Mark
    if ByteOrderMark = UNICODE_BOM then
      StreamCharSet := sscUnicode
    else if ByteOrderMark = UNICODE_BOM_SWAPPED then
      StreamCharSet := sscUnicodeSwapped
    else if Utf8Test = UTF8_BOM then
      StreamCharSet := sscUtf8
    else
      StreamCharSet := sscAnsi;
    DataLeft := Stream.Size - Stream.Position;
    if (StreamCharSet in [sscUnicode, sscUnicodeSwapped]) then begin
      // BOM indicates Unicode text stream
      if DataLeft < SizeOf(WideChar) then
        Result := ''
      else begin
        SetLength(Result, DataLeft div SizeOf(WideChar));
        Stream.Read(PWideChar(Result)^, DataLeft);
        if StreamCharSet = sscUnicodeSwapped then begin
          P := PWord(PWideChar(Result));
          While (P^ <> 0) do begin
            P^ := MakeWord(HiByte(P^), LoByte(P^));
            Inc(P);
          end;
        end;
      end;
    end else if StreamCharSet = sscUtf8 then begin
      // BOM indicates UTF-8 text stream
      SetLength(SA, DataLeft div SizeOf(AnsiChar));
      Stream.Read(PAnsiChar(SA)^, DataLeft);
      Result := UTF8Decode(SA);
    end else begin
      // without byte order mark it is assumed that we are loading ANSI text
      SetLength(SA, DataLeft div SizeOf(AnsiChar));
      Stream.Read(PAnsiChar(SA)^, DataLeft);
      Result := SA;
    end;
    OpenTextfile(Filename, Stream, FileCharset);
    Result := ReadTextfileChunk(Stream, FileCharset);
  finally
    Stream.Free;
  end;
end;
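
For callers that do not need chunking, ReadTextfile keeps the old one-call behaviour. A minimal sketch, assuming the helpers unit is in scope; the LoadDump name and the path are illustrative only. Note that detection in OpenTextFile is purely BOM-driven, so a file without a leading BOM always falls back to fcsAnsi.

// Sketch only: whole-file read via the new wrapper.
procedure LoadDump;
var
  Sql: WideString;
begin
  Sql := ReadTextfile('C:\dumps\backup.sql');
  // A file with a UTF-16 BOM arrives correctly decoded; a file without any
  // BOM is read as ANSI text (cf. the UTF-8 TODO in the commit message).
end;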