Add utility for transforming Trino ROW type columns into Ruby hashes (#…

…117)
treasure-data · Sep 19, 2023 · e14251c · e14251c
1 parent ebd9d9e
commit e14251c
Show file tree

Hide file tree

Showing 4 changed files with 323 additions and 6 deletions.
diff --git a/lib/trino/client/column_value_parser.rb b/lib/trino/client/column_value_parser.rb
@@ -0,0 +1,115 @@
+module Trino::Client
+  class ColumnValueParser
+    INSIDE_MATCHING_PARENS_REGEX = /\((?>[^)(]+|\g<0>)*\)/
+
+    attr_reader :name, :type, :scalar_parser
+
+    def initialize(column, scalar_parser = nil)
+      @name = column.name
+      @type = prepare_type_for_parsing(column.type)
+      @scalar_parser = scalar_parser
+    end
+
+    # Public: Parse the value of a row's field by using its column's Trino type.
+    # Trino types can be scalars like VARCHAR and TIMESTAMP or complex types
+    # like ARRAY and ROW. ROW types are treated as objects.
+    # An ARRAY column's type is an array of types as you'd expect. A ROW
+    # column's type is a comma-separated list of space-separated (name, type) tuples.
+    #
+    # data - The value of a row's field. Can be a string, number, an array of those,
+    #        or an arrays of arrays, etc.
+    # dtype - The Trino type string of the column. See above explanation.
+    #
+    # Returns:
+    # - The given value for strings and numbers
+    # - A Time for timestamps
+    # - A Hash of { field1 => value1, field2 => value2, ...etc } for row types
+    # - An array of the above for array types
+    def value(data, dtype = type)
+      # Convert Trino ARRAY elements into Ruby Arrays
+      if starts_with?(dtype, 'array(')
+        return parse_array_element(data, dtype)
+
+      # Convert Trino ROW elements into Ruby Hashes
+      elsif starts_with?(dtype, 'row(')
+        return parse_row_element(data, dtype)
+
+      # If defined, use scalar_parser to convert scalar types
+      elsif !scalar_parser.nil?
+        return scalar_parser.call(data, dtype)
+      end
+
+      # Otherwise, values are returned unaltered
+      data
+    end
+
+    private
+
+    # Private: Remove quotation marks and handle recent versions of
+    # Trino having a 'with time zone' suffix on some fields that breaks
+    # out assumption that types don't have spaces in them.
+    #
+    # Returns a string.
+    def prepare_type_for_parsing(type)
+      type.gsub('"', '').gsub(' with time zone', '_with_time_zone')
+    end
+
+    def parse_array_element(data, dtype)
+      # If the element is empty, return an empty array
+      return [] if blank?(data)
+
+      # Inner data type will be the current dtype with `array(` and `)` chopped off
+      inner_dtype = dtype.match(INSIDE_MATCHING_PARENS_REGEX)[0][1..-2]
+
+      data.map { |inner_data| value(inner_data, inner_dtype) }
+    end
+
+    def parse_row_element(data, dtype)
+      # If the element is empty, return an empty object
+      return {} if blank?(data)
+
+      parsed_row_element = {}
+
+      inner_dtype = dtype.match(INSIDE_MATCHING_PARENS_REGEX)[0][1..-2]
+      elems = inner_dtype.split(' ')
+      num_elems_to_skip = 0
+      field_position = 0
+
+      # Iterate over each datatype of the row and mutate parsed_row_element
+      # to have a key of the field name and value for that field's value.
+      elems.each_with_index do |field, i|
+        # We detected an array or row and are skipping all of the elements within it
+        # since its conversion was handled by calling `value` recursively.
+        if num_elems_to_skip.positive?
+          num_elems_to_skip -= 1
+          next
+        end
+
+        # Field names never have these characters and are never the last element.
+        next if field.include?(',') || field.include?('(') || field.include?(')') || i == elems.length - 1
+
+        type = elems[(i + 1)..].join(' ')
+
+        # If this row has a nested array or row, the type of this field is that array or row's type.
+        if starts_with?(type, 'array(') || starts_with?(type, 'row(')
+          datatype = type.sub(/\(.*/, '')
+          type = "#{datatype}#{type.match(INSIDE_MATCHING_PARENS_REGEX)[0]}"
+          num_elems_to_skip = type.split(' ').length # see above comment about num_elems_to_skip
+        end
+
+        parsed_row_element[field] = value(data[field_position], type)
+        field_position += 1
+      end
+
+      parsed_row_element
+    end
+
+    def blank?(obj)
+      obj.respond_to?(:empty?) ? !!obj.empty? : !obj
+    end
+
+    def starts_with?(str, prefix)
+      prefix.respond_to?(:to_str) && str[0, prefix.length] == prefix
+    end
+  end
+end
diff --git a/lib/trino/client/query.rb b/lib/trino/client/query.rb
@@ -18,6 +18,7 @@ module Trino::Client
   require 'faraday'
   require 'faraday/gzip'
   require 'faraday/follow_redirects'
+  require 'trino/client/column_value_parser'
   require 'trino/client/models'
   require 'trino/client/errors'
   require 'trino/client/faraday_client'
@@ -44,6 +45,19 @@ def self.faraday_client(options)
       Trino::Client.faraday_client(options)
     end
 
+    def self.transform_row(column_value_parsers, row)
+      row_object = {}
+
+      row.each_with_index do |element, i|
+        column = column_value_parsers[i]
+        value = column.value(element)
+
+        row_object[column.name] = value
+      end
+
+      row_object
+    end
+
     def initialize(api)
       @api = api
     end
@@ -86,6 +100,20 @@ def columns
       return @api.current_results.columns
     end
 
+    def column_value_parsers
+      @column_value_parsers ||= columns.map {|column|
+        ColumnValueParser.new(column)
+      }
+    end
+
+    def transform_rows
+      rows.map(&:transform_row)
+    end
+
+    def transform_row(row)
+      self.class.transform_row(column_value_parsers, row)
+    end
+
     def rows
       rows = []
       each_row_chunk {|chunk|

diff --git a/spec/client_spec.rb b/spec/client_spec.rb
@@ -8,15 +8,17 @@
       [
         Models::Column.new(name: 'animal', type: 'string'),
         Models::Column.new(name: 'score', type: 'integer'),
-        Models::Column.new(name: 'name', type: 'string')
+        Models::Column.new(name: 'name', type: 'string'),
+        Models::Column.new(name: 'foods', type: 'array(string string)'),
+        Models::Column.new(name: 'traits', type: 'row(breed string, num_spots integer)')
       ]
     end
 
     it 'multiple rows' do
       rows = [
-        ['dog', 1, 'Lassie'],
-        ['horse', 5, 'Mr. Ed'],
-        ['t-rex', 37, 'Doug']
+        ['dog', 1, 'Lassie', ['kibble', 'peanut butter'], ['spaniel', 2]],
+        ['horse', 5, 'Mr. Ed', ['hay', 'sugar cubes'], ['some horse', 0]],
+        ['t-rex', 37, 'Doug', ['rodents', 'small dinos'], ['dino', 0]]
       ]
       client.stub(:run).and_return([columns, rows])
 
@@ -27,18 +29,61 @@
       expect(rehashed[0]['animal']).to eq 'dog'
       expect(rehashed[0]['score']).to eq 1
       expect(rehashed[0]['name']).to eq 'Lassie'
+      expect(rehashed[0]['foods']).to eq ['kibble', 'peanut butter']
+      expect(rehashed[0]['traits']).to eq ['spaniel', 2]
 
       expect(rehashed[0].values[0]).to eq 'dog'
       expect(rehashed[0].values[1]).to eq 1
       expect(rehashed[0].values[2]).to eq 'Lassie'
+      expect(rehashed[0].values[3]).to eq ['kibble', 'peanut butter']
+      expect(rehashed[0].values[4]).to eq ['spaniel', 2]
 
       expect(rehashed[1]['animal']).to eq 'horse'
       expect(rehashed[1]['score']).to eq 5
       expect(rehashed[1]['name']).to eq 'Mr. Ed'
+      expect(rehashed[1]['foods']).to eq ['hay', 'sugar cubes']
+      expect(rehashed[1]['traits']).to eq ['some horse', 0]
 
       expect(rehashed[1].values[0]).to eq 'horse'
       expect(rehashed[1].values[1]).to eq 5
       expect(rehashed[1].values[2]).to eq 'Mr. Ed'
+      expect(rehashed[1].values[3]).to eq ['hay', 'sugar cubes']
+      expect(rehashed[1].values[4]).to eq ['some horse', 0]
+    end
+
+    it 'transforms rows into Ruby objects' do
+      rows = [
+        ['dog', 1, 'Lassie', ['kibble', 'peanut butter'], ['spaniel', 2]],
+        ['horse', 5, 'Mr. Ed', ['hay', 'sugar cubes'], ['some horse', 0]],
+        ['t-rex', 37, 'Doug', ['rodents', 'small dinos'], ['dino', 0]]
+      ]
+      client.stub(:run).and_return([columns, rows])
+
+      columns, rows = client.run('fake query')
+      column_value_parsers = columns.map { |column| Trino::Client::ColumnValueParser.new(column) }
+      transformed_rows = rows.map { |row| Trino::Client::Query.transform_row(column_value_parsers, row) }
+
+      expect(transformed_rows[0]).to eq({
+        "animal" => "dog",
+        "score" => 1,
+        "name" => "Lassie",
+        "foods" => ["kibble", "peanut butter"],
+        "traits" => {
+          "breed" => "spaniel",
+          "num_spots" => 2,
+        },
+      })
+
+      expect(transformed_rows[1]).to eq({
+        "animal" => "horse",
+        "score" => 5,
+        "name" => "Mr. Ed",
+        "foods" => ["hay", "sugar cubes"],
+        "traits" => {
+          "breed" => "some horse",
+          "num_spots" => 0,
+        },
+      })
     end
 
     it 'empty results' do
@@ -58,17 +103,21 @@
         "animal" => "wrong",
         "score" => "count",
         "name" => nil,
+        "foods" => nil,
+        "traits" => nil
       }]
     end
 
     it 'handles too many result columns' do
-      rows = [['wrong', 'count', 'too', 'much', 'columns']]
+      rows = [['wrong', 'count', 'too', 'too', 'too', 'much', 'columns']]
       client.stub(:run).and_return([columns, rows])
 
       expect(client.run_with_names('fake query')).to eq [{
         "animal" => "wrong",
         "score" => "count",
-        "name" => 'too',
+        "name" => "too",
+        "foods" => "too",
+        "traits" => "too"
       }]
     end
   end

diff --git a/spec/column_value_parser_spec.rb b/spec/column_value_parser_spec.rb
@@ -0,0 +1,125 @@
+require 'spec_helper'
+
+describe Trino::Client::ColumnValueParser do
+  def column_value(data, type, scalar_parser = nil)
+    column = Struct.new(:type, :name).new(type)
+    Trino::Client::ColumnValueParser.new(column, scalar_parser).value(data)
+  end
+
+  it 'parses varchar values' do
+    data = 'a string'
+    type = 'varchar'
+    expected_value = 'a string'
+    expect(column_value(data, type)).to eq(expected_value)
+  end
+
+  it 'converts scalar values if configured to do so' do
+    data = '2022-07-01T14:53:02Z'
+    type = 'timestamp with time zone'
+    scalar_parser = ->(value, _dtype) { Time.parse(value) }
+    expected_value = Time.parse(data)
+    expect(column_value(data, type, scalar_parser)).to eq(expected_value)
+  end
+
+  it 'parses array type values' do
+    data = [1, 2, 3, 4]
+    type = 'array(integer, integer, integer, integer)'
+    expected_value = [1, 2, 3, 4]
+    expect(column_value(data, type)).to eq(expected_value)
+  end
+
+  it 'parses row type values' do
+    data = [
+      'userId',
+      'userLogin',
+      'SKU_FREE',
+      'TYPE_USER',
+      '2022-07-01T14:53:02Z',
+      ''
+    ]
+    type = 'row(id varchar, "name" varchar, plan_sku varchar, type varchar, created_at timestamp with time zone, organization_tenant_name varchar)'
+    expected_value = {
+      'id' => 'userId',
+      'name' => 'userLogin',
+      'plan_sku' => 'SKU_FREE',
+      'type' => 'TYPE_USER',
+      'created_at' => '2022-07-01T14:53:02Z',
+      'organization_tenant_name' => ''
+    }
+    expect(column_value(data, type)).to eq(expected_value)
+  end
+
+  it 'parses an array of row type values' do
+    data = [[
+      'userId',
+      'userLogin',
+      'SKU_FREE',
+      'TYPE_USER',
+      '2022-07-01T14:53:02Z',
+      ''
+    ]]
+    type = 'array(row(id varchar, "name" varchar, plan_sku varchar, type varchar, created_at timestamp with time zone, organization_tenant_name varchar))'
+    expected_value = [{
+      'id' => 'userId',
+      'name' => 'userLogin',
+      'plan_sku' => 'SKU_FREE',
+      'type' => 'TYPE_USER',
+      'created_at' => '2022-07-01T14:53:02Z',
+      'organization_tenant_name' => ''
+    }]
+    expect(column_value(data, type)).to eq(expected_value)
+  end
+
+  it 'parses row type values that have an array in them' do
+    data = [
+      'userId',
+      %w[userLogin1 userLogin2],
+      'value'
+    ]
+    type = 'row(id varchar, logins array(varchar), onemore varchar)'
+    expected_value = {
+      'id' => 'userId',
+      'logins' => %w[userLogin1 userLogin2],
+      'onemore' => 'value'
+    }
+    expect(column_value(data, type)).to eq(expected_value)
+  end
+
+  it 'parses row type values that have a row in them' do
+    data = [
+      'userId',
+      ['userLogin', '2022-07-01T14:53:02Z', 1234],
+      'value'
+    ]
+    type = 'row(id varchar, subobj row(login varchar, created_at timestamp with time zone, id integer), onemore varchar)'
+    expected_value = {
+      'id' => 'userId',
+      'subobj' => {
+        'login' => 'userLogin',
+        'created_at' => '2022-07-01T14:53:02Z',
+        'id' => 1234
+      },
+      'onemore' => 'value'
+    }
+    expect(column_value(data, type)).to eq(expected_value)
+  end
+
+  it 'parses row type values that have nested rows in them' do
+    data = [
+      'userId',
+      ['userLogin', '2022-07-01T14:53:02Z', [1234]],
+      'value'
+    ]
+    type = 'row(id varchar, subobj row(login varchar, created_at timestamp with time zone, id row(subid integer)), onemore varchar)'
+    expected_value = {
+      'id' => 'userId',
+      'subobj' => {
+        'login' => 'userLogin',
+        'created_at' => '2022-07-01T14:53:02Z',
+        'id' => {'subid' => 1234}
+      },
+      'onemore' => 'value'
+    }
+    expect(column_value(data, type)).to eq(expected_value)
+  end
+end