-
Notifications
You must be signed in to change notification settings - Fork 36
/
useragent.rb
238 lines (201 loc) · 9.52 KB
/
useragent.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# encoding: utf-8
require "logstash-filter-useragent_jars"
require "logstash/filters/base"
require "logstash/namespace"
require 'logstash/plugin_mixins/ecs_compatibility_support'
# Parse user agent strings into structured data based on BrowserScope data
#
# UserAgent filter, adds information about user agent like family, operating
# system, version, and device
#
# Logstash releases ship with the regexes.yaml database made available from
# ua-parser with an Apache 2.0 license. For more details on ua-parser, see
# <https://github.com/tobie/ua-parser/>.
class LogStash::Filters::UserAgent < LogStash::Filters::Base
include LogStash::PluginMixins::ECSCompatibilitySupport(:disabled, :v1, :v8 => :v1)
config_name "useragent"
# The field containing the user agent string. If this field is an
# array, only the first value will be used.
config :source, :validate => :string, :required => true
# The name of the field to assign user agent data into.
#
# If not specified, user agent data will be stored in the root of the event
# (or under `[user_agent]` when ECS compatibility is enabled).
config :target, :validate => :string # default [user_agent] in ECS mode
# `regexes.yaml` file to use
#
# If not specified, this will default to the `regexes.yaml` that ships
# with logstash.
#
# You can find the latest version of this here:
# <https://github.com/ua-parser/uap-core/blob/master/regexes.yaml>
config :regexes, :validate => :string
# A string to prepend to all of the extracted keys
config :prefix, :validate => :string, :default => '' # not supported in ECS mode
# UA parsing is surprisingly expensive. This filter uses an LRU cache to take advantage of the fact that
# user agents are often found adjacent to one another in log files and rarely have a random distribution.
# The higher you set this the more likely an item is to be in the cache and the faster this filter will run.
# However, if you set this too high you can use more memory than desired.
#
# Experiment with different values for this option to find the best performance for your dataset.
#
# This MUST be set to a value > 0. There is really no reason to not want this behavior, the overhead is minimal
# and the speed gains are large.
#
# It is important to note that this config value is global. That is to say all instances of the user agent filter
# share the same cache. The last declared cache size will 'win'. The reason for this is that there would be no benefit
# to having multiple caches for different instances at different points in the pipeline, that would just increase the
# number of cache misses and waste memory.
config :lru_cache_size, :validate => :number, :default => 100_000
# Pre-computes every event field reference this filter writes to, so that
# per-event processing only has to look up instance variables.
#
# Field names depend on the ECS compatibility mode (via `ecs_select`), on the
# configured `target` and, in legacy (ECS disabled) mode, on `prefix`.
def initialize(*params)
  super

  # Base field reference: the user supplied target, or the mode's default
  # (event root in legacy mode, [user_agent] in ECS mode).
  base = @target || ecs_select[disabled: '', v1: '[user_agent]']
  # Wrap in brackets unless it already is a single bracketed field name.
  unless base.empty? || base =~ /^\[[^\[\]]+\]$/
    base = "[#{@target}]"
  end

  # In ECS mode the individual version segments have no home in the schema
  # and are kept under @metadata instead of the target.
  meta = '[@metadata][filter][user_agent]'

  @name_field = "#{base}#{ecs_select[disabled: "[#{@prefix}name]", v1: '[name]']}"
  @device_name_field = "#{base}#{ecs_select[disabled: "[#{@prefix}device]", v1: '[device][name]']}"
  @version_field = "#{base}#{ecs_select[disabled: "[#{@prefix}version]", v1: '[version]']}"

  @major_field = ecs_select[disabled: "#{base}[#{@prefix}major]", v1: "#{meta}[version][major]"]
  @minor_field = ecs_select[disabled: "#{base}[#{@prefix}minor]", v1: "#{meta}[version][minor]"]
  @patch_field = ecs_select[disabled: "#{base}[#{@prefix}patch]", v1: "#{meta}[version][patch]"]

  # [os_full] did not exist in legacy mode prior to ECS-ification
  @os_full_name_field = "#{base}#{ecs_select[disabled: "[#{@prefix}os_full]", v1: '[os][full]']}"
  @os_name_field = "#{base}#{ecs_select[disabled: "[#{@prefix}os_name]", v1: '[os][name]']}"
  # legacy-only duplicate of [os_name]; nil (i.e. not written) in ECS mode
  @legacy_os_field = ecs_select[disabled: "#{base}[#{@prefix}os]", v1: nil]
  @os_version_field = "#{base}#{ecs_select[disabled: "[#{@prefix}os_version]", v1: '[os][version]']}"

  @os_major_field = ecs_select[disabled: "#{base}[#{@prefix}os_major]", v1: "#{meta}[os][version][major]"]
  @os_minor_field = ecs_select[disabled: "#{base}[#{@prefix}os_minor]", v1: "#{meta}[os][version][minor]"]
  @os_patch_field = ecs_select[disabled: "#{base}[#{@prefix}os_patch]", v1: "#{meta}[os][version][patch]"]

  # NOTE: `user_agent.original` can not be provided reliably: the patterns do not
  # reliably give back the matched group and they support the UA string being
  # prefixed and/or suffixed
end
# Plugin start-up hook: warns about the unsupported `prefix` option in ECS
# mode and builds the caching UA parser, using the custom `regexes` file
# when one is configured.
def register
  if ecs_compatibility != :disabled
    prefix_configured = @prefix && !@prefix.empty?
    @logger.warn "Field prefix isn't supported in ECS compatibility mode, please remove `prefix => #{@prefix.inspect}`" if prefix_configured
  end

  parser_class = org.logstash.uaparser.CachingParser
  @parser =
    if @regexes.nil?
      # no custom database configured: single-argument constructor
      parser_class.new(lru_cache_size)
    else
      @logger.debug("Using user agent regexes", :regexes => @regexes)
      parser_class.new(@regexes, lru_cache_size)
    end
end
# Parses the user agent string found in the event's `source` field and, on
# success, writes the extracted data onto the event.
#
# - When the source value is an Array only its first element is used.
# - Missing (nil) or empty values are skipped silently.
# - Any error raised while parsing is logged and the event is left untouched.
#
# @param event the event being processed (needs #get, #remove, #to_hash)
def filter(event)
  useragent = event.get(@source)
  useragent = useragent.first if useragent.is_a?(Array)

  # Only ask for emptiness when the value supports it: a non-String value
  # (e.g. a number) must not raise NoMethodError out of the filter here, it is
  # handed to the parser instead so the failure is logged by the rescue below.
  return if useragent.nil? || (useragent.respond_to?(:empty?) && useragent.empty?)

  begin
    ua_data = lookup_useragent(useragent)
  rescue => e
    @logger.error("Unknown error while parsing user agent data",
                  :exception => e.class, :message => e.message, :backtrace => e.backtrace,
                  :field => @source, :event => event.to_hash)
    return
  end

  return unless ua_data

  # when writing into the source field itself, drop the raw string first so
  # the parsed sub-fields can take its place
  event.remove(@source) if @target == @source
  set_fields(event, useragent, ua_data)

  filter_matched(event)
end
private
# Hands the raw User-Agent string to the parser created in #register and
# returns whatever the parser produces.
def lookup_useragent(useragent)
  parser = @parser
  parser.parse(useragent)
end
# Copies the parsed user agent data onto the event.
#
# Every value is passed through #duped_string before being stored.
# Fields whose parsed value is missing are not written at all.
#
# @param event the event to populate
# @param ua_source [String] the raw User-Agent string (used to complete the version)
# @param ua_data the parser result, exposing #userAgent, #device and #os
def set_fields(event, ua_source, ua_data)
  # UserAgentParser strings are US-ASCII
  agent = ua_data.userAgent
  event.set(@name_field, duped_string(agent.family))

  device = ua_data.device
  event.set(@device_name_field, duped_string(device)) if device

  [
    [@major_field, agent.major],
    [@minor_field, agent.minor],
    [@patch_field, agent.patch]
  ].each do |field, segment|
    event.set(field, duped_string(segment)) if segment
  end

  set_version(event, ua_source, agent) # UA version string e.g. "89.0.4389.90"

  os = ua_data.os
  return unless os

  # os.major, os.minor, ... are all strings (major may be e.g. 'Vista' or '10')
  [
    [@os_major_field, os.major],
    [@os_minor_field, os.minor],
    [@os_patch_field, os.patch]
  ].each do |field, segment|
    event.set(field, duped_string(segment)) if segment
  end

  os_version = build_os_version(os)
  event.set(@os_version_field, os_version) if os_version

  family = os.family
  return unless family

  os_name = duped_string(family)
  event.set(@os_name_field, os_name)
  # the legacy [os] field only exists when ECS compatibility is disabled
  event.set(@legacy_os_field, os_name.dup) if @legacy_os_field

  # full name is "<name> <version>", or just the name without a version
  full_name = os_name.dup
  full_name << ' ' << os_version if os_version
  event.set(@os_full_name_field, full_name)
end
# Reconstructs the dotted User-Agent version string (e.g. "89.0.4389.90")
# from the parsed segments and stores it in the version field.
#
# Segments are appended in order (minor, patch, patchMinor) and building stops
# at the first missing one. Nothing is written when there is no major version.
def set_version(event, ua_source, ua)
  return unless @version_field && ua.major

  # only Chrome has all 4 segments, while Firefox only uses major.minor
  trailing = [ua.minor, ua.patch, ua.patchMinor].take_while { |segment| segment }

  version = duped_string(ua.major)
  trailing.each { |segment| version << '.' << segment }

  # minor and patch matched but patchMinor did not: it may still be present
  # in the raw string, so try to recover it from there
  if trailing.size == 2
    version = check_and_adjust_version(ua_source, version) || version
  end

  event.set(@version_field, version)
end
# Tries to complete a reconstructed "major.minor.patch" version with a fourth
# (patch-minor) segment taken from the raw User-Agent string.
#
# @param ua_source [String] the raw User-Agent string
# @param version [String, nil] the version rebuilt from the parsed segments
# @return [String, nil] nil when `version` is nil or does not literally occur
#   in `ua_source` (i.e. the parser "interpreted" it); otherwise the version,
#   extended by ".<n>" when it is directly followed by a dot and a plain
#   integer segment that runs up to the next space or the end of the string
def check_and_adjust_version(ua_source, version)
  return nil if !version || (i = ua_source.index(version)).nil?
  i += version.size
  # complete version when patchMinor is not matched but still there
  if ua_source[i] == '.' # we built the version with dots
    # the segment ends at the next space, or at the end of the string when
    # the version is the last thing in the User-Agent
    segment_end = ua_source.index(' ', i + 1) || ua_source.size
    patch_minor = ua_source[(i + 1)...segment_end]
    # accept canonical integers only (rejects "", "beta", "90)", "07", ...)
    if patch_minor.eql? patch_minor.to_i.to_s
      version = "#{version}.#{patch_minor}"
    end
  end
  version
end
# Reconstructs the OS version string from the parsed OS segments.
#
# NOTE: UA regexes don't always give us the versions back
# they do get "corrected" for various OSes such as:
# - Windows (Windows NT 6.0 => 'Vista')
# - Windows ('Windows NT 6.3' => '8','1')
# - Windows ('Windows NT 10.0' => '10')
# - iOS ('Darwin/15.5' => '9','3','2')
#
# A numeric major is joined to the minor with a dot, a non-numeric one with a
# space; the remaining segments are always joined with dots.
#
# @return [String, nil] the version string, or nil without a major version
def build_os_version(os)
  major = os.major
  return unless major

  separator = major.to_i.to_s == major ? '.' : ' '
  version = duped_string(major)

  minor = os.minor
  return version unless minor
  version << separator << minor

  patch = os.patch
  return version unless patch
  version << '.' << patch

  patch_minor = os.patchMinor
  version << '.' << patch_minor if patch_minor
  version
end
# Returns a UTF-8 tagged copy of the given string.
#
# A copy is made because later filters may modify the values stored on the
# event, which would otherwise corrupt the parser's cached results. See the
# uap source for details: https://github.com/ua-parser/uap-ruby/tree/master/lib/user_agent_parser
def duped_string(str)
  copy = str.dup
  copy.force_encoding(Encoding::UTF_8)
end
end