Skip to content

Commit

Permalink
fix: Correct Stream Position after BOM test
Browse files Browse the repository at this point in the history
  • Loading branch information
gcarreno committed Jan 15, 2024
1 parent 6123b5b commit a66ec6c
Show file tree
Hide file tree
Showing 4 changed files with 322 additions and 36 deletions.
63 changes: 50 additions & 13 deletions src/text/opp.text.pas
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,41 @@ interface
, SysUtils
;

const
  // Byte Order Marks, written as the raw byte sequence expected at the very
  // start of a file. The trailing comment on each line is the same sequence in hex.
  cBOMUTF8    : String = #$EF#$BB#$BF;     // EF BB BF
  cBOMUTF16BE : String = #$FE#$FF;         // FE FF
  cBOMUTF16LE : String = #$FF#$FE;         // FF FE
  cBOMUTF32BE : String = #$00#$00#$FE#$FF; // 00 00 FE FF
  // The little endian UTF-32 mark is the byte-reversed big endian one. Note that
  // it begins with the same two bytes as the UTF-16 little endian mark, so any
  // detection code has to test for UTF-32 LE before UTF-16 LE.
  cBOMUTF32LE : String = #$FF#$FE#$00#$00; // FF FE 00 00

type
{ TTextFileType }
TTextFileType = (tftUnknown, tftAnsi, tftUTF8, tftUTF16, tftUTF32);
TTextFileType = (tftUnknown, tftAnsi, tftUTF8, tftUTF16BE, tftUTF16LE, tftUTF32BE, tftUTF32LE);

function TextFileTypeToString(const ATextFileType: TTextFileType): String;

resourcestring
rsTextFileTypeUnknown = 'Text File Type Unknown';
rsTextFileTypeAnsi = 'Text File Type Ansi';
rsTextFileTypeUTF8 = 'Text File Type UTF8';
rsTextFileTypeUTF16 = 'Text File Type UTF16';
rsTextFileTypeUTF32 = 'Text File Type UTF32';
rsTextFileTypeUTF16BE = 'Text File Type UTF16 Big Endian';
rsTextFileTypeUTF16LE = 'Text File Type UTF16 Little Endian';
rsTextFileTypeUTF32BE = 'Text File Type UTF32 Big Endian';
rsTextFileTypeUTF32LE = 'Text File Type UTF32 Little Endian';

type
{ TTextBOMType }
TTextBOMType = (tbtUnknown, tbtUTF8, tbtUTF16BE, tbtUTF16LE, tbtUTF32BE, tbtUTF32LE);

function TextBOMTypeToString(const ATextBOMType: TTextBOMType): String;

resourcestring
rsTextBOMTypeUnknown = 'Text BOM Type Unknown';
rsTextBOMTypeUTF8 = 'Text BOM Type UTF8';
rsTextBOMTypeUTF16BE = 'Text BOM Type UTF16 Big Endian';
rsTextBOMTypeUTF16LE = 'Text BOM Type UTF16 Little Endian';
rsTextBOMTypeUTF32BE = 'Text BOM Type UTF32 Big Endian';
rsTextBOMTypeUTF32LE = 'Text BOM Type UTF32 Little Endian';

type
{ TTextCharType }
Expand Down Expand Up @@ -48,22 +71,36 @@ implementation
{ Returns the resource string describing ATextFileType.

  Every member of TTextFileType has its own entry. The else branch only
  guarantees that Result is assigned when an out-of-range ordinal is cast
  to the enumeration. }
function TextFileTypeToString(const ATextFileType: TTextFileType): String;
begin
  case ATextFileType of
    tftUnknown: Result:= rsTextFileTypeUnknown;
    tftAnsi:    Result:= rsTextFileTypeAnsi;
    tftUTF8:    Result:= rsTextFileTypeUTF8;
    tftUTF16BE: Result:= rsTextFileTypeUTF16BE;
    tftUTF16LE: Result:= rsTextFileTypeUTF16LE;
    tftUTF32BE: Result:= rsTextFileTypeUTF32BE;
    tftUTF32LE: Result:= rsTextFileTypeUTF32LE;
  else
    // Not a declared member: report it as unknown instead of leaving Result unset
    Result:= rsTextFileTypeUnknown;
  end;
end;

{ Returns the resource string describing ATextBOMType.

  Every member of TTextBOMType has its own entry. The else branch only
  guarantees that Result is assigned when an out-of-range ordinal is cast
  to the enumeration. }
function TextBOMTypeToString(const ATextBOMType: TTextBOMType): String;
begin
  case ATextBOMType of
    tbtUnknown: Result:= rsTextBOMTypeUnknown;
    tbtUTF8:    Result:= rsTextBOMTypeUTF8;
    tbtUTF16BE: Result:= rsTextBOMTypeUTF16BE;
    tbtUTF16LE: Result:= rsTextBOMTypeUTF16LE;
    tbtUTF32BE: Result:= rsTextBOMTypeUTF32BE;
    tbtUTF32LE: Result:= rsTextBOMTypeUTF32LE;
  else
    // Not a declared member: report it as unknown instead of leaving Result unset
    Result:= rsTextBOMTypeUnknown;
  end;
end;

{ Returns the resource string describing ATextCharType.

  The descriptions come from the rsTextCharType* strings, not from the
  rsTextFileType* ones. The else branch only guarantees that Result is
  assigned when an out-of-range ordinal is cast to the enumeration. }
function TextCharTypeToString(const ATextCharType: TTextCharType): String;
begin
  case ATextCharType of
    tctUnknown: Result:= rsTextCharTypeUnknown;
    tctAnsi:    Result:= rsTextCharTypeAnsi;
    tctUTF8:    Result:= rsTextCharTypeUTF8;
    tctUTF16:   Result:= rsTextCharTypeUTF16;
    tctUTF32:   Result:= rsTextCharTypeUTF32;
  else
    // Not a declared member: report it as unknown instead of leaving Result unset
    Result:= rsTextCharTypeUnknown;
  end;
end;

Expand Down
165 changes: 164 additions & 1 deletion src/text/opp.text.sourcefile.pas
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ TTextSourceFile = class(TObject)
FFilename: String;
FSourceFileStream: TFileStream;
FFileType: TTextFileType;
FFileHasBOM: Boolean;

function GetStreamSize: Int64;
protected
Expand All @@ -38,6 +39,8 @@ TTextSourceFile = class(TObject)
read FFileType;
property Size: Int64
read GetStreamSize;
property FileHasBOM: Boolean
read FFileHasBOM;
published
end;

Expand All @@ -50,6 +53,10 @@ implementation
{ TTextSourceFile }

constructor TTextSourceFile.Create(const AFileName: String);
var
buffer: Byte;
bytesread: Int64;
BOMTest: String;
begin
FSourceFileStream:= nil;
if not FileExists(AFileName) then raise ETextSourceFileDoesNotExist.Create(
Expand All @@ -63,7 +70,159 @@ constructor TTextSourceFile.Create(const AFileName: String);
// Open the file, sniff its Byte Order Mark (if any) and leave the stream
// positioned on the first byte that follows the mark.
FFilename:= AFileName;
FSourceFileStream:= TFileStream.Create(AFileName, fmOpenRead);
{ #todo 999 -ogcarreno : This needs to be BOM and UTF aware!! }

// Nothing is known about the file until its leading bytes have been inspected
FFileType:= tftUnknown;
FFileHasBOM:= False;
buffer:= 0;

// Read the leading bytes once. Four is the length of the longest mark; a
// shorter file simply yields a shorter probe string and cannot match the
// longer marks below.
FSourceFileStream.Position:= 0; // Just in case
BOMTest:= EmptyStr;
while (Length(BOMTest) < 4) and
      (FSourceFileStream.Position < FSourceFileStream.Size) do
begin
  bytesread:= FSourceFileStream.Read(buffer, SizeOf(buffer));
  // Size promised more data than Read delivered
  if bytesread = 0 then raise ETextSourceFilePrematureEOF.Create(
    Format(
      rsETextSourceFilePrematureEOF,
      [ FFilename ]
    )
  );
  BOMTest:= BOMTest + Char(buffer);
end;

// Test the longest marks first and stop at the first match. The order matters:
// the UTF-32 LE mark (FF FE 00 00) starts with the UTF-16 LE mark (FF FE), so
// testing UTF-16 first, or testing it again afterwards, would misreport a
// UTF-32 LE file as UTF-16 LE.
if BOMTest = cBOMUTF32BE then
begin
  FFileType:= tftUTF32BE;
  FFileHasBOM:= True;
end
else
// UTF-32 LE is spelled here as "UTF-16 LE mark followed by two zero bytes"
if BOMTest = cBOMUTF16LE + #$00#$00 then
begin
  FFileType:= tftUTF32LE;
  FFileHasBOM:= True;
end
else
if Copy(BOMTest, 1, 3) = cBOMUTF8 then
begin
  FFileType:= tftUTF8;
  FFileHasBOM:= True;
end
else
if Copy(BOMTest, 1, 2) = cBOMUTF16BE then
begin
  FFileType:= tftUTF16BE;
  FFileHasBOM:= True;
end
else
if Copy(BOMTest, 1, 2) = cBOMUTF16LE then
begin
  FFileType:= tftUTF16LE;
  FFileHasBOM:= True;
end;

// Skip over the mark that was found so the next read returns file content
case FFileType of
  tftUnknown: begin
    { #todo 999 -ogcarreno : We are assuming this for the time being }
    FFileType:= tftUTF8;
    // No mark present: the content starts at the very first byte
    FSourceFileStream.Position:= 0;
  end;
  tftUTF8: begin
    FSourceFileStream.Position:= 3;
  end;
  tftUTF16BE, tftUTF16LE: begin
    FSourceFileStream.Position:= 2;
  end;
  tftUTF32BE, tftUTF32LE: begin
    FSourceFileStream.Position:= 4;
  end;
end;

end;

destructor TTextSourceFile.Destroy;
Expand Down Expand Up @@ -131,6 +290,10 @@ function TTextSourceFile.GetNextChar: TTextCharacter;
if bytesRead = 0 then raise ETextSourceFilePrematureEOF.Create(rsETextSourceFilePrematureEOF);
Result.Value := Result.Value + Char(buffer);
end;
otherwise
{ #todo 999 -ogcarreno : This is temporary since it does not account for UTF16 nor UTF32 }
Result.&Type:= tctAnsi;
Result.Value := Char(buffer);
end;
end;
end;
Expand Down
1 change: 1 addition & 0 deletions tests/TestObjectPascalParserCLI.lpi
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@
<Unit1>
<Filename Value="tests/opp.tests.pas"/>
<IsPartOfProject Value="True"/>
<UnitName Value="OPP.Tests"/>
</Unit1>
<Unit2>
<Filename Value="text/testobjectpascalparsertextsourcefile.pas"/>
Expand Down
Loading

0 comments on commit a66ec6c

Please sign in to comment.