Skip to content

Commit

Permalink
Merge pull request #131 from mraspaud/fix-tbm-decoding
Browse files Browse the repository at this point in the history
Enable decoding of tbm dataset name
  • Loading branch information
mraspaud authored Aug 28, 2024
2 parents a3cb9a8 + 6e8a726 commit 60f1849
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 32 deletions.
31 changes: 19 additions & 12 deletions pygac/pod_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,20 +38,20 @@

import datetime
import logging

try:
from enum import IntFlag
except ImportError:
# python version < 3.6, use a simple object without nice representation
IntFlag = object

import numpy as np

from pyorbital.geoloc_instrument_definitions import avhrr_gac
from pyorbital.geoloc import compute_pixels, get_lonlatalt
from pyorbital.geoloc_instrument_definitions import avhrr_gac

from pygac.clock_offsets_converter import get_offsets
from pygac.correct_tsm_issue import TSM_AFFECTED_INTERVALS_POD, get_tsm_idx
from pygac.reader import Reader, ReaderError, NoTLEData
from pygac.reader import DecodingError, NoTLEData, Reader, ReaderError
from pygac.slerp import slerp
from pygac.utils import file_opener

Expand Down Expand Up @@ -322,18 +322,13 @@ def read_header(cls, filename, fileobj=None, header_date="auto"):
fd_.read(tbm_header.itemsize),
dtype=tbm_header, count=1)
try:
data_set_name = _tbm_head['data_set_name'].decode()
except UnicodeDecodeError:
data_set_name = '---'
allowed_empty = (42*b'\x00' + b' ')
if (cls.data_set_pattern.match(data_set_name)
or (_tbm_head['data_set_name'] == allowed_empty)):
tbm_head = _tbm_head.copy()
tbm_head = cls._validate_tbm_header(_tbm_head)
tbm_offset = tbm_header.itemsize
else:
fd_.seek(0)
except DecodingError:
tbm_head = None
tbm_offset = 0

fd_.seek(tbm_offset, 0)
header = cls.choose_header_based_on_timestamp(header_date, fd_)
fd_.seek(tbm_offset, 0)
# need to copy frombuffer to have write access on head
Expand All @@ -344,6 +339,18 @@ def read_header(cls, filename, fileobj=None, header_date="auto"):
cls._validate_header(head)
return tbm_head, head

@classmethod
def _validate_tbm_header(cls, potential_tbm_header):
    """Return a copy of a candidate TBM header once its name is accepted.

    The candidate is accepted when its ``data_set_name`` field is either
    the blank placeholder (42 null bytes followed by space padding) or a
    name that ``cls._decode_data_set_name`` is able to decode.

    Args:
        potential_tbm_header: record read from the start of the file that
            may or may not be a TBM header.

    Returns:
        A copy of ``potential_tbm_header``.

    Raises:
        DecodingError: propagated from ``cls._decode_data_set_name`` when
            the data set name is not valid.
    """
    name_field = potential_tbm_header['data_set_name']
    blank_name = (42*b'\x00' + b' ')
    is_blank = name_field == blank_name
    if not is_blank:
        # Called only for its validation side effect: it raises a
        # DecodingError if the name is not valid.
        cls._decode_data_set_name(name_field)
    return potential_tbm_header.copy()


@classmethod
def choose_header_based_on_timestamp(cls, header_date, fd_):
"""Choose the header dtype based on the timestamp."""
Expand Down
28 changes: 17 additions & 11 deletions pygac/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,16 +209,10 @@ def _correct_data_set_name(cls, header, filename):
filename (str): path to file
"""
filename = str(filename)
for encoding in "utf-8", "cp500":
data_set_name = header['data_set_name']
try:
data_set_name = cls._decode_data_set_name(data_set_name, encoding)
except DecodingError as err:
LOG.debug(str(err))
else:
header["data_set_name"] = data_set_name
break
else:
data_set_name = header['data_set_name']
try:
header["data_set_name"] = cls._decode_data_set_name(data_set_name)
except DecodingError:
LOG.debug(f'The data_set_name in header {header["data_set_name"]} does not match.'
' Use filename instead.')
match = cls.data_set_pattern.search(filename)
Expand All @@ -232,7 +226,19 @@ def _correct_data_set_name(cls, header, filename):
return header

@classmethod
def _decode_data_set_name(cls, data_set_name, encoding):
def _decode_data_set_name(cls, data_set_name):
    """Decode a raw data set name by trying each supported encoding.

    The encodings "utf-8" and "cp500" are tried in that order; the first
    result accepted by ``cls._decode_data_set_name_for_encoding`` is
    returned. Each failed attempt is logged at debug level.

    Args:
        data_set_name: the encoded data set name taken from a header.

    Returns:
        The decoded data set name.

    Raises:
        DecodingError: if none of the encodings yields an accepted name.
    """
    for codec in ("utf-8", "cp500"):
        try:
            return cls._decode_data_set_name_for_encoding(data_set_name, codec)
        except DecodingError as failure:
            LOG.debug(str(failure))
    # Reaching this point means every candidate encoding was rejected.
    raise DecodingError("Could not reliably decode the dataset name.")

@classmethod
def _decode_data_set_name_for_encoding(cls, data_set_name, encoding):
data_set_name = data_set_name.decode(encoding, errors='ignore')
if not cls.data_set_pattern.match(data_set_name):
raise DecodingError(f'The data_set_name in header {data_set_name} '
Expand Down
18 changes: 9 additions & 9 deletions pygac/tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,20 @@
import os
import sys
import unittest
import pytest

from unittest import mock

import numpy as np
import numpy.testing
from pygac.gac_reader import GACReader, ReaderError
from pygac.lac_reader import LACReader
from pygac.pod_reader import POD_QualityIndicator
import pytest

from pygac.gac_pod import scanline
from pygac.reader import NoTLEData
from pygac.gac_reader import GACReader, ReaderError
from pygac.lac_pod import LACPODReader

from pygac.pod_reader import tbm_header as tbm_header_dtype, header3
from pygac.lac_pod import scanline as lacpod_scanline
from pygac.lac_reader import LACReader
from pygac.pod_reader import POD_QualityIndicator, header3
from pygac.pod_reader import tbm_header as tbm_header_dtype
from pygac.reader import NoTLEData


class TestPath(os.PathLike):
Expand Down Expand Up @@ -688,7 +688,7 @@ def pod_file_with_tbm_header(tmp_path):
number_of_scans = 3

tbm_header = np.zeros(1, dtype=tbm_header_dtype)
tbm_header["data_set_name"] = b"BRN.HRPT.NJ.D00322.S0334.E0319.B3031919.BL "
tbm_header["data_set_name"] = "BRN.HRPT.NJ.D00322.S0334.E0319.B3031919.BL\x80\x80".encode("cp500")
tbm_header["select_flag"] = b"S"
tbm_header["beginning_latitude"] = b"+77"
tbm_header["ending_latitude"] = b"+22"
Expand Down

0 comments on commit 60f1849

Please sign in to comment.