diff --git a/portable_config/script-opts/uosc.conf b/portable_config/script-opts/uosc.conf
index 4f037d14..82ed9101 100644
--- a/portable_config/script-opts/uosc.conf
+++ b/portable_config/script-opts/uosc.conf
@@ -130,7 +130,7 @@ stream_quality_options=4320,2160,1440,1080,720,480,360,240,144
 
 # （加载文件/导入视频音频轨时）文件浏览器的扩展名过滤列表。默认值覆盖极广，此预设精简为常见的视频和音频格式
 video_types=avi,flv,m2ts,m4v,mkv,mov,mp4,mpeg,mpg,ogv,rm,rmvb,ts,vob,webm,wmv
-audio_types=aac,ape,dsf,dts,flac,m4a,mka,mp3,ogg,opus,wav,wma,wv
+audio_types=aac,ac3,ape,dsf,dts,flac,m4a,mka,mp3,ogg,opus,wav,wma,wv
 image_types=apng,avif,bmp,gif,jfif,jpeg,jpg,jxl,png,svg,tif,tiff,webp
 # （导入字幕时）文件浏览器的扩展名过滤列表。默认值覆盖极广，此预设精简为常见的字幕格式
 subtitle_types=ass,idx,lrc,mks,pgs,sup,srt,ssa,txt,vtt
diff --git a/portable_config/scripts/uosc/elements/Button.lua b/portable_config/scripts/uosc/elements/Button.lua
index 140563cf..ed06568f 100644
--- a/portable_config/scripts/uosc/elements/Button.lua
+++ b/portable_config/scripts/uosc/elements/Button.lua
@@ -23,19 +23,20 @@ function Button:init(id, props)
 end
 
 function Button:on_coordinates() self.font_size = round((self.by - self.ay) * 0.7) end
-function Button:on_mbtn_left_down()
-	-- Don't accept clicks while hidden.
-	if self:get_visibility() <= 0 then return end
+function Button:handle_cursor_down()
 	-- We delay the callback to next tick, otherwise we are risking race
 	-- conditions as we are in the middle of event dispatching.
 	-- For example, handler might add a menu to the end of the element stack, and that
-	-- than picks up this click even we are in right now, and instantly closes itself.
+	-- then picks up this click event we are in right now, and instantly closes itself.
 	mp.add_timeout(0.01, self.on_click)
 end
 
 function Button:render()
 	local visibility = self:get_visibility()
 	if visibility <= 0 then return end
+	if self.proximity_raw == 0 then
+		cursor.on_primary_down = function() self:handle_cursor_down() end
+	end
 
 	local ass = assdraw.ass_new()
 	local is_hover = self.proximity_raw == 0
@@ -54,7 +55,6 @@ function Button:render()
 	-- Tooltip on hover
 	if is_hover and self.tooltip then ass:tooltip(self, self.tooltip) end
 
-
 	-- Badge
 	local icon_clip
 	if self.badge then
diff --git a/portable_config/scripts/uosc/elements/Element.lua b/portable_config/scripts/uosc/elements/Element.lua
index d8f1e22a..bc3125b9 100644
--- a/portable_config/scripts/uosc/elements/Element.lua
+++ b/portable_config/scripts/uosc/elements/Element.lua
@@ -15,7 +15,7 @@ function Element:init(id, props)
 	-- Relative proximity from `0` - mouse outside `proximity_max` range, to `1` - mouse within `proximity_min` range.
 	self.proximity = 0
 	-- Raw proximity in pixels.
-	self.proximity_raw = infinity
+	self.proximity_raw = INFINITY
 	---@type number `0-1` factor to force min visibility. Used for toggling element's permanent visibility.
 	self.min_visibility = 0
 	---@type number `0-1` factor to force a visibility value. Used for flashing, fading out, and other animations
@@ -44,7 +44,7 @@ function Element:destroy()
 	Elements:remove(self)
 end
 
-function Element:reset_proximity() self.proximity, self.proximity_raw = 0, infinity end
+function Element:reset_proximity() self.proximity, self.proximity_raw = 0, INFINITY end
 
 ---@param ax number
 ---@param ay number
diff --git a/portable_config/scripts/uosc/elements/Elements.lua b/portable_config/scripts/uosc/elements/Elements.lua
index f6fba012..186b9e93 100644
--- a/portable_config/scripts/uosc/elements/Elements.lua
+++ b/portable_config/scripts/uosc/elements/Elements.lua
@@ -29,8 +29,6 @@ function Elements:remove(idOrElement)
 end
 
 function Elements:update_proximities()
-	local capture_mbtn_left = false
-	local capture_wheel = false
 	local menu_only = Elements.menu ~= nil
 	local mouse_leave_elements = {}
 	local mouse_enter_elements = {}
@@ -42,26 +40,13 @@ function Elements:update_proximities()
 
 			-- If menu is open, all other elements have to be disabled
 			if menu_only then
-				if element.ignores_menu then
-					capture_mbtn_left = true
-					capture_wheel = true
-					element:update_proximity()
-				else
-					element:reset_proximity()
-				end
+				if element.ignores_menu then element:update_proximity()
+				else element:reset_proximity() end
 			else
 				element:update_proximity()
 			end
 
-			-- Element has global forced key listeners
-			if element.on_global_mbtn_left_down then capture_mbtn_left = true end
-			if element.on_global_wheel_up or element.on_global_wheel_down then capture_wheel = true end
-
 			if element.proximity_raw == 0 then
-				-- Element has local forced key listeners
-				if element.on_mbtn_left_down then capture_mbtn_left = true end
-				if element.on_wheel_up or element.on_wheel_up then capture_wheel = true end
-
 				-- Mouse entered element area
 				if previous_proximity_raw ~= 0 then
 					mouse_enter_elements[#mouse_enter_elements + 1] = element
@@ -75,10 +60,6 @@ function Elements:update_proximities()
 		end
 	end
 
-	-- Enable key group captures requested by elements
-	mp[capture_mbtn_left and 'enable_key_bindings' or 'disable_key_bindings']('mbtn_left')
-	mp[capture_wheel and 'enable_key_bindings' or 'disable_key_bindings']('wheel')
-
 	-- Trigger `mouse_leave` and `mouse_enter` events
 	for _, element in ipairs(mouse_leave_elements) do element:trigger('mouse_leave') end
 	for _, element in ipairs(mouse_enter_elements) do element:trigger('mouse_enter') end
@@ -142,26 +123,4 @@ end
 function Elements:has(id) return self[id] ~= nil end
 function Elements:ipairs() return ipairs(self.itable) end
 
----@param name string Event name.
-function Elements:create_proximity_dispatcher(name)
-	return function(...) self:proximity_trigger(name, ...) end
-end
-
-mp.set_key_bindings({
-	{
-		'mbtn_left',
-		Elements:create_proximity_dispatcher('mbtn_left_up'),
-		function(...)
-			update_mouse_pos(nil, mp.get_property_native('mouse-pos'))
-			Elements:proximity_trigger('mbtn_left_down', ...)
-		end,
-	},
-	{'mbtn_left_dbl', 'ignore'},
-}, 'mbtn_left', 'force')
-
-mp.set_key_bindings({
-	{'wheel_up', Elements:create_proximity_dispatcher('wheel_up')},
-	{'wheel_down', Elements:create_proximity_dispatcher('wheel_down')},
-}, 'wheel', 'force')
-
 return Elements
diff --git a/portable_config/scripts/uosc/elements/Menu.lua b/portable_config/scripts/uosc/elements/Menu.lua
index 8dcb7182..4048b76b 100644
--- a/portable_config/scripts/uosc/elements/Menu.lua
+++ b/portable_config/scripts/uosc/elements/Menu.lua
@@ -4,10 +4,10 @@ local Element = require('elements/Element')
 ---@alias MenuData {type?: string; title?: string; hint?: string; keep_open?: boolean; separator?: boolean; items?: MenuDataItem[]; selected_index?: integer;}
 ---@alias MenuDataItem MenuDataValue|MenuData
 ---@alias MenuDataValue {title?: string; hint?: string; icon?: string; value: any; bold?: boolean; italic?: boolean; muted?: boolean; active?: boolean; keep_open?: boolean; separator?: boolean;}
----@alias MenuOptions {mouse_nav?: boolean; on_open?: fun(); on_close?: fun(); on_back?: fun()}
+---@alias MenuOptions {mouse_nav?: boolean; on_open?: fun(); on_close?: fun(); on_back?: fun(); on_move_item?: fun(from_index: integer, to_index: integer, submenu_path: integer[]); on_delete_item?: fun(index: integer, submenu_path: integer[])}
 
 -- Internal data structure created from `Menu`.
----@alias MenuStack {id?: string; type?: string; title?: string; hint?: string; selected_index?: number; keep_open?: boolean; separator?: boolean; items: MenuStackItem[]; parent_menu?: MenuStack; active?: boolean; width: number; height: number; top: number; scroll_y: number; scroll_height: number; title_width: number; hint_width: number; max_width: number; is_root?: boolean; fling?: Fling}
+---@alias MenuStack {id?: string; type?: string; title?: string; hint?: string; selected_index?: number; keep_open?: boolean; separator?: boolean; items: MenuStackItem[]; parent_menu?: MenuStack; submenu_path: integer[]; active?: boolean; width: number; height: number; top: number; scroll_y: number; scroll_height: number; title_width: number; hint_width: number; max_width: number; is_root?: boolean; fling?: Fling}
 ---@alias MenuStackItem MenuStackValue|MenuStack
 ---@alias MenuStackValue {title?: string; hint?: string; icon?: string; value: any; active?: boolean; bold?: boolean; italic?: boolean; muted?: boolean; keep_open?: boolean; separator?: boolean; title_width: number; hint_width: number}
 ---@alias Fling {y: number, distance: number, time: number, easing: fun(x: number), duration: number, update_cursor?: boolean}
@@ -56,6 +56,7 @@ function Menu:close(immediate, callback)
 			menu.is_closing, menu.stack, menu.current, menu.all, menu.by_id = false, nil, nil, {}, {}
 			menu:disable_key_bindings()
 			Elements:update_proximities()
+			cursor.queue_autohide()
 			if callback then callback() end
 			request_render()
 		end
@@ -135,7 +136,7 @@ end
 function Menu:update(data)
 	self.type = data.type
 
-	local new_root = {is_root = true}
+	local new_root = {is_root = true, submenu_path = {}}
 	local new_all = {}
 	local new_by_id = {}
 	local menus_to_serialize = {{new_root, data}}
@@ -169,6 +170,7 @@ function Menu:update(data)
 			-- Submenu
 			if item_data.items then
 				item.parent_menu = menu
+				item.submenu_path = itable_join(menu.submenu_path, {i})
 				menus_to_serialize[#menus_to_serialize + 1] = {item, item_data}
 			end
 
@@ -210,8 +212,10 @@ function Menu:update_content_dimensions()
 	local hint_opts = {size = self.font_size_hint}
 
 	for _, menu in ipairs(self.all) do
+		title_opts.bold, title_opts.italic = true, false
+		local max_width = text_width(menu.title, title_opts) + 2 * self.item_padding
+
 		-- Estimate width of a widest item
-		local max_width = 0
 		for _, item in ipairs(menu.items) do
 			local icon_width = item.icon and self.font_size or 0
 			item.title_width = text_width(item.title, title_opts)
@@ -223,11 +227,6 @@ function Menu:update_content_dimensions()
 			if estimated_width > max_width then max_width = estimated_width end
 		end
 
-		-- Also check menu title
-		title_opts.bold, title_opts.italic = true, false
-		local menu_title_width = text_width(menu.title, title_opts)
-		if menu_title_width > max_width then max_width = menu_title_width end
-
 		menu.max_width = max_width
 	end
 
@@ -236,8 +235,8 @@ end
 
 function Menu:update_dimensions()
 	-- Coordinates and sizes are of the scrollable area to make
-	-- consuming values in rendering and collisions easier. Title drawn above this, so
-	-- we need to account for that in max_height and ay position.
+	-- consuming values in rendering and collisions easier. Title is rendered
+	-- above it, so we need to account for that in max_height and ay position.
 	local min_width = state.fullormaxed and options.menu_min_width_fullscreen or options.menu_min_width
 
 	for _, menu in ipairs(self.all) do
@@ -252,6 +251,11 @@ function Menu:update_dimensions()
 		self:scroll_to(menu.scroll_y, menu) -- clamps scroll_y to scroll limits
 	end
 
+	self:update_coordinates()
+end
+
+-- Updates element coordinates to match the currently open (sub)menu.
+function Menu:update_coordinates()
 	local ax = round((display.width - self.current.width) / 2) + self.offset_x
 	self:set_coordinates(ax, self.current.top, ax + self.current.width, self.current.top + self.current.height)
 end
@@ -359,7 +363,7 @@ end
 function Menu:select_value(value, menu)
 	menu = menu or self.current
 	local index = itable_find(menu.items, function(item) return item.value == value end)
-	self:select_index(index, 5)
+	self:select_index(index)
 end
 
 ---@param menu? MenuStack
@@ -400,16 +404,23 @@ function Menu:activate_one_value(value, menu)
 	self:activate_one_index(index, menu)
 end
 
----@param id string
-function Menu:activate_submenu(id)
-	local submenu = self.by_id[id]
-	if submenu then
-		self.current = submenu
+---@param menu MenuStack One of the menus in `self.all`.
+function Menu:activate_menu(menu)
+	if itable_index_of(self.all, menu) then
+		self.current = menu
+		self:update_coordinates()
+		self:reset_navigation()
 		request_render()
 	else
-		msg.error(string.format('Requested submenu id "%s" doesn\'t exist', id))
+		msg.error('Attempt to open a menu not in `self.all` list.')
 	end
-	self:reset_navigation()
+end
+
+---@param id string
+function Menu:activate_submenu(id)
+	local submenu = self.by_id[id]
+	if submenu then self:activate_menu(submenu)
+	else msg.error(string.format('Requested submenu id "%s" doesn\'t exist', id)) end
 end
 
 ---@param index? integer
@@ -456,8 +467,7 @@ function Menu:back()
 
 	if parent then
 		menu.selected_index = nil
-		self.current = parent
-		self:update_dimensions()
+		self:activate_menu(parent)
 		self:tween(self.offset_x - menu.width / 2, 0, function(offset) self:set_offset_x(offset) end)
 		self.opacity = 1 -- in case tween above canceled fade in animation
 	else
@@ -473,11 +483,10 @@ function Menu:open_selected_item(opts)
 		local item = menu.items[menu.selected_index]
 		-- Is submenu
 		if item.items then
-			self.current = item
 			if opts.preselect_submenu_item then
 				item.selected_index = #item.items > 0 and 1 or nil
 			end
-			self:update_dimensions()
+			self:activate_menu(item)
 			self:tween(self.offset_x + menu.width / 2, 0, function(offset) self:set_offset_x(offset) end)
 			self.opacity = 1 -- in case tween above canceled fade in animation
 		else
@@ -491,10 +500,33 @@ function Menu:open_selected_item_soft() self:open_selected_item({keep_open = tru
 function Menu:open_selected_item_preselect() self:open_selected_item({preselect_submenu_item = true}) end
 function Menu:select_item_below_cursor() self.current.selected_index = self:get_item_index_below_cursor() end
 
+---@param index integer
+function Menu:move_selected_item_to(index)
+	local from, callback = self.current.selected_index, self.opts.on_move_item
+	if callback and from and from ~= index and index >= 1 and index <= #self.current.items then
+		callback(from, index, self.current.submenu_path)
+		self.current.selected_index = index
+		request_render()
+	end
+end
+
+function Menu:move_selected_item_up()
+	if self.current.selected_index then self:move_selected_item_to(self.current.selected_index - 1) end
+end
+
+function Menu:move_selected_item_down()
+	if self.current.selected_index then self:move_selected_item_to(self.current.selected_index + 1) end
+end
+
+function Menu:delete_selected_item()
+	local index, callback = self.current.selected_index, self.opts.on_delete_item
+	if callback and index then callback(index, self.current.submenu_path) end
+end
+
 function Menu:on_display() self:update_dimensions() end
 function Menu:on_prop_fullormaxed() self:update_content_dimensions() end
 
-function Menu:on_global_mbtn_left_down()
+function Menu:handle_cursor_down()
 	if self.proximity_raw == 0 then
 		self.drag_data = {{y = cursor.y, time = mp.get_time()}}
 		self.current.fling = nil
@@ -514,7 +546,7 @@ function Menu:fling_distance()
 	return #self.drag_data < 2 and 0 or ((first.y - last.y) / ((first.time - last.time) / 0.03)) * 10
 end
 
-function Menu:on_global_mbtn_left_up()
+function Menu:handle_cursor_up()
 	if self.proximity_raw == 0 and self.drag_data and not self.is_dragging then
 		self:select_item_below_cursor()
 		self:open_selected_item({preselect_submenu_item = false, keep_open = self.modifiers and self.modifiers.shift})
@@ -546,8 +578,8 @@ function Menu:on_global_mouse_move()
 	request_render()
 end
 
-function Menu:on_wheel_up() self:scroll_by(self.scroll_step * -3, nil, {update_cursor = true}) end
-function Menu:on_wheel_down() self:scroll_by(self.scroll_step * 3, nil, {update_cursor = true}) end
+function Menu:handle_wheel_up() self:scroll_by(self.scroll_step * -3, nil, {update_cursor = true}) end
+function Menu:handle_wheel_down() self:scroll_by(self.scroll_step * 3, nil, {update_cursor = true}) end
 
 function Menu:on_pgup()
 	local menu = self.current
@@ -585,6 +617,8 @@ function Menu:enable_key_bindings()
 	-- doesn't support 'repeatable' flag, so we are stuck with this monster.
 	self:add_key_binding('up', 'menu-prev1', self:create_key_action('prev'), 'repeatable')
 	self:add_key_binding('down', 'menu-next1', self:create_key_action('next'), 'repeatable')
+	self:add_key_binding('ctrl+up', 'menu-move-up', self:create_key_action('move_selected_item_up'), 'repeatable')
+	self:add_key_binding('ctrl+down', 'menu-move-down', self:create_key_action('move_selected_item_down'), 'repeatable')
 	self:add_key_binding('left', 'menu-back1', self:create_key_action('back'))
 	self:add_key_binding('right', 'menu-select1', self:create_key_action('open_selected_item_preselect'))
 	self:add_key_binding('shift+right', 'menu-select-soft1',
@@ -608,6 +642,7 @@ function Menu:enable_key_bindings()
 	self:add_key_binding('pgdwn', 'menu-page-down', self:create_key_action('on_pgdwn'), 'repeatable')
 	self:add_key_binding('home', 'menu-home', self:create_key_action('on_home'))
 	self:add_key_binding('end', 'menu-end', self:create_key_action('on_end'))
+	self:add_key_binding('del', 'menu-delete-item', self:create_key_action('delete_selected_item'))
 end
 
 function Menu:disable_key_bindings()
@@ -620,8 +655,8 @@ function Menu:create_modified_mbtn_left_handler(modifiers)
 	return function()
 		self.mouse_nav = true
 		self.modifiers = modifiers
-		self:on_global_mbtn_left_down()
-		self:on_global_mbtn_left_up()
+		self:handle_cursor_down()
+		self:handle_cursor_up()
 		self.modifiers = nil
 	end
 end
@@ -650,6 +685,13 @@ function Menu:render()
 	end
 	if update_cursor then self:select_item_below_cursor() end
 
+	cursor.on_primary_down = function() self:handle_cursor_down() end
+	cursor.on_primary_up = function() self:handle_cursor_up() end
+	if self.proximity_raw == 0 then
+		cursor.on_wheel_down = function() self:handle_wheel_down() end
+		cursor.on_wheel_up = function() self:handle_wheel_up() end
+	end
+
 	local ass = assdraw.ass_new()
 	local opacity = options.menu_opacity * self.opacity
 	local spacing = self.item_padding
diff --git a/portable_config/scripts/uosc/elements/Speed.lua b/portable_config/scripts/uosc/elements/Speed.lua
index 10d89a06..bc4f4405 100644
--- a/portable_config/scripts/uosc/elements/Speed.lua
+++ b/portable_config/scripts/uosc/elements/Speed.lua
@@ -44,9 +44,7 @@ function Speed:speed_step(speed, up)
 	end
 end
 
-function Speed:on_mbtn_left_down()
-	-- Don't accept clicks while hidden.
-	if self:get_visibility() <= 0 then return end
+function Speed:handle_cursor_down()
 	self:tween_stop() -- Stop and cleanup possible ongoing animations
 	self.dragging = {
 		start_time = mp.get_time(),
@@ -87,14 +85,13 @@ function Speed:on_global_mouse_move()
 	end
 end
 
-function Speed:on_mbtn_left_up()
-	-- Reset speed on short clicks
-	if self.dragging and math.abs(self.dragging.distance) < 6 and mp.get_time() - self.dragging.start_time < 0.15 then
-		mp.set_property_native('speed', 1)
+function Speed:handle_cursor_up()
+	if self.proximity_raw == 0 then
+		-- Reset speed on short clicks
+		if self.dragging and math.abs(self.dragging.distance) < 6 and mp.get_time() - self.dragging.start_time < 0.15 then
+			mp.set_property_native('speed', 1)
+		end
 	end
-end
-
-function Speed:on_global_mbtn_left_up()
 	self.dragging = nil
 	request_render()
 end
@@ -104,8 +101,8 @@ function Speed:on_global_mouse_leave()
 	request_render()
 end
 
-function Speed:on_wheel_up() mp.set_property_native('speed', self:speed_step(state.speed, true)) end
-function Speed:on_wheel_down() mp.set_property_native('speed', self:speed_step(state.speed, false)) end
+function Speed:handle_wheel_up() mp.set_property_native('speed', self:speed_step(state.speed, true)) end
+function Speed:handle_wheel_down() mp.set_property_native('speed', self:speed_step(state.speed, false)) end
 
 function Speed:render()
 	local visibility = self:get_visibility()
@@ -113,6 +110,18 @@ function Speed:render()
 
 	if opacity <= 0 then return end
 
+	if self.proximity_raw == 0 then
+		cursor.on_primary_down = function()
+			self:handle_cursor_down()
+			cursor.on_primary_up = function() self:handle_cursor_up() end
+		end
+		cursor.on_wheel_down = function() self:handle_wheel_down() end
+		cursor.on_wheel_up = function() self:handle_wheel_up() end
+	end
+	if self.dragging then
+		cursor.on_primary_up = function() self:handle_cursor_up() end
+	end
+
 	local ass = assdraw.ass_new()
 
 	-- Background
diff --git a/portable_config/scripts/uosc/elements/Timeline.lua b/portable_config/scripts/uosc/elements/Timeline.lua
index 92884def..a4730a28 100644
--- a/portable_config/scripts/uosc/elements/Timeline.lua
+++ b/portable_config/scripts/uosc/elements/Timeline.lua
@@ -6,6 +6,7 @@ local Timeline = class(Element)
 function Timeline:new() return Class.new(self) --[[@as Timeline]] end
 function Timeline:init()
 	Element.init(self, 'timeline')
+	---@type false|{pause: boolean, distance: number, last: {x: number, y: number}}
 	self.pressed = false
 	self.obstructed = false
 	self.size_max = 0
@@ -13,7 +14,8 @@ function Timeline:init()
 	self.size_min_override = options.timeline_start_hidden and 0 or nil
 	self.font_size = 0
 	self.top_border = options.timeline_border
-	self.hovered_chapter = nil
+	self.is_hovered = false
+	self.has_thumbnail = false
 
 	-- Release any dragging when file gets unloaded
 	mp.register_event('end-file', function() self.pressed = false end)
@@ -44,7 +46,7 @@ function Timeline:get_effective_line_width()
 	return state.fullormaxed and options.timeline_line_width_fullscreen or options.timeline_line_width
 end
 
-function Timeline:get_is_hovered() return self.enabled and (self.proximity_raw == 0 or self.hovered_chapter ~= nil) end
+function Timeline:get_is_hovered() return self.enabled and self.is_hovered end
 
 function Timeline:update_dimensions()
 	if state.fullormaxed then
@@ -89,69 +91,48 @@ function Timeline:set_from_cursor(fast)
 		mp.commandv('seek', self:get_time_at_x(cursor.x), fast and 'absolute+keyframes' or 'absolute+exact')
 	end
 end
-function Timeline:clear_thumbnail() mp.commandv('script-message-to', 'thumbfast', 'clear') end
 
-function Timeline:determine_chapter_click_handler()
-	if self.hovered_chapter then
-		if not self.on_global_mbtn_left_down then
-			self.on_global_mbtn_left_down = function()
-				if self.hovered_chapter then mp.commandv('seek', self.hovered_chapter.time, 'absolute+exact') end
-			end
-		end
-	else
-		if self.on_global_mbtn_left_down then
-			self.on_global_mbtn_left_down = nil
-			if self.proximity_raw ~= 0 then self:clear_thumbnail() end
-		end
-	end
+function Timeline:clear_thumbnail()
+	mp.commandv('script-message-to', 'thumbfast', 'clear')
+	self.has_thumbnail = false
 end
 
-function Timeline:on_mbtn_left_down()
-	-- `self.on_global_mbtn_left_down` has precedent
-	if self.on_global_mbtn_left_down then return end
-
-	self.pressed = true
-	self.pressed_pause = state.pause
+function Timeline:handle_cursor_down()
+	self.pressed = {pause = state.pause, distance = 0, last = {x = cursor.x, y = cursor.y}}
 	mp.set_property_native('pause', true)
 	self:set_from_cursor()
+	cursor.on_primary_up = function() self:handle_cursor_up() end
 end
 function Timeline:on_prop_duration() self:decide_enabled() end
 function Timeline:on_prop_time() self:decide_enabled() end
 function Timeline:on_prop_border() self:update_dimensions() end
 function Timeline:on_prop_fullormaxed() self:update_dimensions() end
 function Timeline:on_display() self:update_dimensions() end
-function Timeline:on_mouse_leave()
-	if not self.hovered_chapter then self:clear_thumbnail() end
-end
-function Timeline:on_global_mbtn_left_up()
-	if thumbnail.pause then thumbnail.pause = false end
+function Timeline:handle_cursor_up()
 	if self.pressed then
-		mp.set_property_native('pause', self.pressed_pause)
+		mp.set_property_native('pause', self.pressed.pause)
 		self.pressed = false
 	end
-	self:clear_thumbnail()
 end
 function Timeline:on_global_mouse_leave()
 	self.pressed = false
-	self:clear_thumbnail()
 end
 
 Timeline.seek_timer = mp.add_timeout(0.05, function() Elements.timeline:set_from_cursor() end)
 Timeline.seek_timer:kill()
 function Timeline:on_global_mouse_move()
 	if self.pressed then
-		thumbnail.pause = true
-		self:clear_thumbnail()
+		self.pressed.distance = self.pressed.distance + get_point_to_point_proximity(self.pressed.last, cursor)
+		self.pressed.last.x, self.pressed.last.y = cursor.x, cursor.y
 		if self.width / state.duration < 10 then
 			self:set_from_cursor(true)
 			self.seek_timer:kill()
 			self.seek_timer:resume()
 		else self:set_from_cursor() end
 	end
-	self:determine_chapter_click_handler()
 end
-function Timeline:on_wheel_up() mp.commandv('seek', options.timeline_step) end
-function Timeline:on_wheel_down() mp.commandv('seek', -options.timeline_step) end
+function Timeline:handle_wheel_up() mp.commandv('seek', options.timeline_step) end
+function Timeline:handle_wheel_down() mp.commandv('seek', -options.timeline_step) end
 
 function Timeline:render()
 	if self.size_max == 0 then return end
@@ -159,8 +140,23 @@ function Timeline:render()
 	local size_min = self:get_effective_size_min()
 	local size = self:get_effective_size()
 	local visibility = self:get_visibility()
+	self.is_hovered = false
 
-	if size < 1 then return end
+	if size < 1 then
+		if self.has_thumbnail then self:clear_thumbnail() end
+		return
+	end
+
+	if self.proximity_raw == 0 then
+		self.is_hovered = true
+		cursor.on_primary_down = function() self:handle_cursor_down() end
+		cursor.on_wheel_down = function() self:handle_wheel_down() end
+		cursor.on_wheel_up = function() self:handle_wheel_up() end
+	end
+
+	if self.pressed then
+		cursor.on_primary_up = function() self:handle_cursor_up() end
+	end
 
 	local ass = assdraw.ass_new()
 
@@ -251,7 +247,7 @@ function Timeline:render()
 	end
 
 	-- Chapters
-	self.hovered_chapter = nil
+	local hovered_chapter = nil
 	if (options.timeline_chapters_opacity > 0
 		and (#state.chapters > 0 or state.ab_loop_a or state.ab_loop_b)
 		) then
@@ -277,7 +273,7 @@ function Timeline:render()
 
 			if #state.chapters > 0 then
 				-- Find hovered chapter indicator
-				local hovered_chapter, closest_delta = nil, infinity
+				local closest_delta = INFINITY
 
 				if self.proximity_raw < diamond_radius_hovered then
 					for i, chapter in ipairs(state.chapters) do
@@ -285,6 +281,10 @@ function Timeline:render()
 						local cursor_chapter_delta = math.sqrt((cursor.x - chapter_x) ^ 2 + (cursor.y - chapter_y) ^ 2)
 						if cursor_chapter_delta <= diamond_radius_hovered and cursor_chapter_delta < closest_delta then
 							hovered_chapter, closest_delta = chapter, cursor_chapter_delta
+							self.is_hovered = true
+							cursor.on_primary_down = function()
+								mp.commandv('seek', hovered_chapter.time, 'absolute+exact')
+							end
 						end
 					end
 				end
@@ -294,11 +294,7 @@ function Timeline:render()
 				end
 
 				-- Render hovered chapter above others
-				if hovered_chapter then
-					draw_chapter(hovered_chapter.time, diamond_radius_hovered)
-					self.hovered_chapter = hovered_chapter
-					self:determine_chapter_click_handler()
-				end
+				if hovered_chapter then draw_chapter(hovered_chapter.time, diamond_radius_hovered) end
 			end
 
 			-- A-B loop indicators
@@ -366,10 +362,11 @@ function Timeline:render()
 	end
 
 	-- Hovered time and chapter
-	if (self.proximity_raw == 0 or self.pressed or self.hovered_chapter) and
+	local rendered_thumbnail = false
+	if (self.proximity_raw == 0 or self.pressed or hovered_chapter) and
 		not (Elements.speed and Elements.speed.dragging) then
-		local cursor_x = self.hovered_chapter and t2x(self.hovered_chapter.time) or cursor.x
-		local hovered_seconds = self.hovered_chapter and self.hovered_chapter.time or self:get_time_at_x(cursor.x)
+		local cursor_x = hovered_chapter and t2x(hovered_chapter.time) or cursor.x
+		local hovered_seconds = hovered_chapter and hovered_chapter.time or self:get_time_at_x(cursor.x)
 
 		-- Cursor line
 		-- 0.5 to switch when the pixel is half filled in
@@ -387,7 +384,11 @@ function Timeline:render()
 		tooltip_anchor.ay = tooltip_anchor.ay - self.font_size - offset
 
 		-- Thumbnail
-		if not thumbnail.disabled and thumbnail.width ~= 0 and thumbnail.height ~= 0 and not thumbnail.pause then
+		if not thumbnail.disabled
+			and (not self.pressed or self.pressed.distance < 5)
+			and thumbnail.width ~= 0
+			and thumbnail.height ~= 0
+		then
 			local scale_x, scale_y = display.scale_x, display.scale_y
 			local border, margin_x, margin_y = math.ceil(2 * scale_x), round(10 * scale_x), round(5 * scale_y)
 			local thumb_x_margin, thumb_y_margin = border + margin_x, border + margin_y
@@ -401,6 +402,7 @@ function Timeline:render()
 			local bx, by = (thumb_x + thumb_width + border) / scale_x, (thumb_y + thumb_height + border) / scale_y
 			ass:rect(ax, ay, bx, by, {color = bg, border = 1, border_color = fg, border_opacity = 0.08, radius = 2})
 			mp.commandv('script-message-to', 'thumbfast', 'thumb', hovered_seconds, thumb_x, thumb_y)
+			self.has_thumbnail, rendered_thumbnail = true, true
 			tooltip_anchor.ax, tooltip_anchor.bx, tooltip_anchor.ay = ax, bx, ay
 		end
 
@@ -416,6 +418,9 @@ function Timeline:render()
 		end
 	end
 
+	-- Clear thumbnail
+	if not rendered_thumbnail and self.has_thumbnail then self:clear_thumbnail() end
+
 	return ass
 end
 
diff --git a/portable_config/scripts/uosc/elements/TopBar.lua b/portable_config/scripts/uosc/elements/TopBar.lua
index ac59d8be..dba3c390 100644
--- a/portable_config/scripts/uosc/elements/TopBar.lua
+++ b/portable_config/scripts/uosc/elements/TopBar.lua
@@ -16,7 +16,7 @@ function TopBarButton:init(id, props)
 	self.command = props.command
 end
 
-function TopBarButton:on_mbtn_left_down()
+function TopBarButton:handle_cursor_down()
 	mp.command(type(self.command) == 'function' and self.command() or self.command)
 end
 
@@ -28,6 +28,7 @@ function TopBarButton:render()
 	-- Background on hover
 	if self.proximity_raw == 0 then
 		ass:rect(self.ax, self.ay, self.bx, self.by, {color = self.background, opacity = visibility})
+		cursor.on_primary_down = function() self:handle_cursor_down() end
 	end
 
 	local width, height = self.bx - self.ax, self.by - self.ay
@@ -52,8 +53,9 @@ function TopBar:init()
 	self.size_min_override = options.timeline_start_hidden and 0 or nil
 	self.top_border = options.timeline_border
 	self.show_alt_title = false
+	self.main_title, self.alt_title = nil, nil
 
-	local function decide_maximized_command()
+	local function get_maximized_command()
 		return state.border
 			and (state.fullscreen and 'set fullscreen no;cycle window-maximized' or 'cycle window-maximized')
 			or 'set window-maximized no;cycle fullscreen'
@@ -62,9 +64,11 @@ function TopBar:init()
 	-- Order aligns from right to left
 	self.buttons = {
 		TopBarButton:new('tb_close', {icon = 'close', background = '2311e8', command = 'quit'}),
-		TopBarButton:new('tb_max', {icon = 'crop_square', background = '222222', command = decide_maximized_command}),
+		TopBarButton:new('tb_max', {icon = 'crop_square', background = '222222', command = get_maximized_command}),
 		TopBarButton:new('tb_min', {icon = 'minimize', background = '222222', command = 'cycle window-minimized'}),
 	}
+
+	self:decide_titles()
 end
 
 function TopBar:decide_enabled()
@@ -79,6 +83,32 @@ function TopBar:decide_enabled()
 	end
 end
 
+function TopBar:decide_titles()
+	self.alt_title = state.alt_title ~= '' and state.alt_title or nil
+	self.main_title = state.title ~= '' and state.title or nil
+
+	-- Fall back to alt title if main is empty
+	if not self.main_title then
+		self.main_title, self.alt_title = self.alt_title, nil
+	end
+
+	-- Deduplicate the main and alt titles by checking if one completely
+	-- contains the other, and using only the longer one.
+	if self.main_title and self.alt_title and not self.show_alt_title then
+		local longer_title, shorter_title
+		if #self.main_title < #self.alt_title then
+			longer_title, shorter_title = self.alt_title, self.main_title
+		else
+			longer_title, shorter_title = self.main_title, self.alt_title
+		end
+
+		local escaped_shorter_title = string.gsub(shorter_title --[[@as string]], "[%(%)%.%+%-%*%?%[%]%^%$%%]", "%%%1")
+		if string.match(longer_title --[[@as string]], escaped_shorter_title) then
+			self.main_title, self.alt_title = longer_title, nil
+		end
+	end
+end
+
 function TopBar:update_dimensions()
 	self.size = state.fullormaxed and options.top_bar_size_fullscreen or options.top_bar_size
 	self.icon_size = round(self.size * 0.5)
@@ -104,6 +134,9 @@ function TopBar:toggle_title()
 	self.show_alt_title = not self.show_alt_title
 end
 
+function TopBar:on_prop_title() self:decide_titles() end
+function TopBar:on_prop_alt_title() self:decide_titles() end
+
 function TopBar:on_prop_border()
 	self:decide_enabled()
 	self:update_dimensions()
@@ -119,10 +152,6 @@ function TopBar:on_prop_maximized()
 	self:update_dimensions()
 end
 
-function TopBar:on_mbtn_left_down()
-	if cursor.x < self.title_bx then self:toggle_title() end
-end
-
 function TopBar:on_display() self:update_dimensions() end
 
 function TopBar:render()
@@ -148,55 +177,71 @@ function TopBar:render()
 			ass:rect(title_ax, title_ay, bx, self.by - bg_margin, {color = fg, opacity = visibility, radius = 2})
 			ass:txt(title_ax + (bx - title_ax) / 2, self.ay + (self.size / 2), 5, formatted_text, opts)
 			title_ax = bx + bg_margin
-		end
-
-		-- Title
-		local text = self.show_alt_title and state.alt_title or state.title
-		if max_bx - title_ax > self.font_size * 3 and text and text ~= '' then
-			local opts = {
-				size = self.font_size, wrap = 2, color = bgt, border = 1, border_color = bg, opacity = visibility,
-				clip = string.format('\\clip(%d, %d, %d, %d)', self.ax, self.ay, max_bx, self.by),
-			}
-			local bx = math.min(max_bx, title_ax + text_width(text, opts) + padding * 2)
-			local by = self.by - bg_margin
-			ass:rect(title_ax, title_ay, bx, by, {
-				color = bg, opacity = visibility * options.top_bar_title_opacity, radius = 2,
-			})
-			ass:txt(title_ax + padding, self.ay + (self.size / 2), 4, text, opts)
-			title_ay = by + 1
-		end
+			local rect = {ax = self.ax, ay = self.ay, bx = bx, by = self.by}
 
-		-- Alt title
-		if state.alt_title and options.top_bar_alt_title_place == 'below' and state.alt_title ~= state.title then
-			local font_size = self.font_size * 0.9
-			local height = font_size * 1.3
-			local by = title_ay + height
-			local opts = {size = font_size, wrap = 2, color = bgt, border = 1, border_color = bg, opacity = visibility}
-			local bx = math.min(max_bx, title_ax + text_width(state.alt_title, opts) + padding * 2)
-			opts.clip = string.format('\\clip(%d, %d, %d, %d)', title_ax, title_ay, bx, by)
-			ass:rect(title_ax, title_ay, bx, by, {
-				color = bg, opacity = visibility * options.top_bar_title_opacity, radius = 2,
-			})
-			ass:txt(title_ax + padding, title_ay + height / 2, 4, state.alt_title, opts)
-			title_ay = by + 1
+			if get_point_to_rectangle_proximity(cursor, rect) == 0 then
+				cursor.on_primary_down = function() mp.command('script-binding uosc/playlist') end
+			end
 		end
 
-		-- Subtitle: current chapter
-		if state.current_chapter and max_bx - title_ax > self.font_size * 3 then
-			local font_size = self.font_size * 0.8
-			local height = font_size * 1.3
-			local text = '└ ' .. state.current_chapter.index .. ': ' .. state.current_chapter.title
-			local by = title_ay + height
-			local opts = {
-				size = font_size, italic = true, wrap = 2, color = bgt,
-				border = 1, border_color = bg, opacity = visibility * 0.8,
-			}
-			local bx = math.min(max_bx, title_ax + text_width(text, opts) + padding * 2)
-			opts.clip = string.format('\\clip(%d, %d, %d, %d)', title_ax, title_ay, bx, by)
-			ass:rect(title_ax, title_ay, bx, by, {
-				color = bg, opacity = visibility * options.top_bar_title_opacity, radius = 2,
-			})
-			ass:txt(title_ax + padding, title_ay + height / 2, 4, text, opts)
+		-- Skip rendering titles if there's not enough horizontal space
+		if max_bx - title_ax > self.font_size * 3 then
+			-- Main title
+			local main_title = self.show_alt_title and self.alt_title or self.main_title
+			if main_title then
+				local opts = {
+					size = self.font_size, wrap = 2, color = bgt, border = 1, border_color = bg, opacity = visibility,
+					clip = string.format('\\clip(%d, %d, %d, %d)', self.ax, self.ay, max_bx, self.by),
+				}
+				local bx = math.min(max_bx, title_ax + text_width(main_title, opts) + padding * 2)
+				local by = self.by - bg_margin
+				local rect = {ax = title_ax, ay = self.ay, bx = self.title_bx, by = self.by}
+
+				if get_point_to_rectangle_proximity(cursor, rect) == 0 then
+					cursor.on_primary_down = function() self:toggle_title() end
+				end
+
+				ass:rect(title_ax, title_ay, bx, by, {
+					color = bg, opacity = visibility * options.top_bar_title_opacity, radius = 2,
+				})
+				ass:txt(title_ax + padding, self.ay + (self.size / 2), 4, main_title, opts)
+				title_ay = by + 1
+			end
+
+			-- Alt title
+			if self.alt_title and options.top_bar_alt_title_place == 'below' then
+				local font_size = self.font_size * 0.9
+				local height = font_size * 1.3
+				local by = title_ay + height
+				local opts = {
+					size = font_size, wrap = 2, color = bgt, border = 1, border_color = bg, opacity = visibility
+				}
+				local bx = math.min(max_bx, title_ax + text_width(self.alt_title, opts) + padding * 2)
+				opts.clip = string.format('\\clip(%d, %d, %d, %d)', title_ax, title_ay, bx, by)
+				ass:rect(title_ax, title_ay, bx, by, {
+					color = bg, opacity = visibility * options.top_bar_title_opacity, radius = 2,
+				})
+				ass:txt(title_ax + padding, title_ay + height / 2, 4, self.alt_title, opts)
+				title_ay = by + 1
+			end
+
+			-- Subtitle: current chapter
+			if state.current_chapter then
+				local font_size = self.font_size * 0.8
+				local height = font_size * 1.3
+				local text = '└ ' .. state.current_chapter.index .. ': ' .. state.current_chapter.title
+				local by = title_ay + height
+				local opts = {
+					size = font_size, italic = true, wrap = 2, color = bgt,
+					border = 1, border_color = bg, opacity = visibility * 0.8,
+				}
+				local bx = math.min(max_bx, title_ax + text_width(text, opts) + padding * 2)
+				opts.clip = string.format('\\clip(%d, %d, %d, %d)', title_ax, title_ay, bx, by)
+				ass:rect(title_ax, title_ay, bx, by, {
+					color = bg, opacity = visibility * options.top_bar_title_opacity, radius = 2,
+				})
+				ass:txt(title_ax + padding, title_ay + height / 2, 4, text, opts)
+			end
 		end
 	end
 
diff --git a/portable_config/scripts/uosc/elements/Volume.lua b/portable_config/scripts/uosc/elements/Volume.lua
index 8934f8b6..356feca9 100644
--- a/portable_config/scripts/uosc/elements/Volume.lua
+++ b/portable_config/scripts/uosc/elements/Volume.lua
@@ -7,10 +7,12 @@ local MuteButton = class(Element)
 ---@param props? ElementProps
 function MuteButton:new(props) return Class.new(self, 'volume_mute', props) --[[@as MuteButton]] end
 function MuteButton:get_visibility() return Elements.volume:get_visibility(self) end
-function MuteButton:on_mbtn_left_down() mp.commandv('cycle', 'mute') end
 function MuteButton:render()
 	local visibility = self:get_visibility()
 	if visibility <= 0 then return end
+	if self.proximity_raw == 0 then
+		cursor.on_primary_down = function() mp.commandv('cycle', 'mute') end
+	end
 	local ass = assdraw.ass_new()
 	local icon_name = state.mute and 'volume_off' or 'volume_up'
 	local width = self.bx - self.ax
@@ -58,17 +60,11 @@ function VolumeSlider:on_coordinates()
 	self.spacing = round(width * 0.2)
 	self.radius = math.max(2, (self.bx - self.ax) / 10)
 end
-function VolumeSlider:on_mbtn_left_down()
-	self.pressed = true
-	self:set_from_cursor()
-end
-function VolumeSlider:on_global_mbtn_left_up() self.pressed = false end
-function VolumeSlider:on_global_mouse_leave() self.pressed = false end
 function VolumeSlider:on_global_mouse_move()
 	if self.pressed then self:set_from_cursor() end
 end
-function VolumeSlider:on_wheel_up() self:set_volume(state.volume + options.volume_step) end
-function VolumeSlider:on_wheel_down() self:set_volume(state.volume - options.volume_step) end
+function VolumeSlider:handle_wheel_up() self:set_volume(state.volume + options.volume_step) end
+function VolumeSlider:handle_wheel_down() self:set_volume(state.volume - options.volume_step) end
 
 function VolumeSlider:render()
 	local visibility = self:get_visibility()
@@ -77,8 +73,21 @@ function VolumeSlider:render()
 
 	if width <= 0 or height <= 0 or visibility <= 0 then return end
 
+	if self.proximity_raw == 0 then
+		cursor.on_primary_down = function()
+			self.pressed = true
+			self:set_from_cursor()
+			cursor.on_primary_up = function() self.pressed = false end
+		end
+		cursor.on_wheel_down = function() self:handle_wheel_down() end
+		cursor.on_wheel_up = function() self:handle_wheel_up() end
+	end
+	if self.pressed then
+		cursor.on_primary_up = function() self.pressed = false end
+	end
+
 	local ass = assdraw.ass_new()
-	local nudge_y, nudge_size = self.draw_nudge and self.nudge_y or -infinity, self.nudge_size
+	local nudge_y, nudge_size = self.draw_nudge and self.nudge_y or -INFINITY, self.nudge_size
 	local volume_y = self.ay + options.volume_border +
 		((height - (options.volume_border * 2)) * (1 - math.min(state.volume / state.volume_max, 1)))
 
@@ -90,7 +99,7 @@ function VolumeSlider:render()
 		local ax, bx, by = ax + p, bx - p, by - p
 		local r = math.max(1, self.radius - p)
 		local d, rh = r * 2, r / 2
-		local nudge_size = ((quarter_pi_sin * (nudge_size - p)) + p) / quarter_pi_sin
+		local nudge_size = ((QUARTER_PI_SIN * (nudge_size - p)) + p) / QUARTER_PI_SIN
 		local path = assdraw.ass_new()
 		path:move_to(bx - r, by)
 		path:line_to(ax + r, by)
diff --git a/portable_config/scripts/uosc/lib/menus.lua b/portable_config/scripts/uosc/lib/menus.lua
index 3c2de237..fce1443b 100644
--- a/portable_config/scripts/uosc/lib/menus.lua
+++ b/portable_config/scripts/uosc/lib/menus.lua
@@ -12,7 +12,7 @@ function open_command_menu(data, opts)
 	---@type MenuOptions
 	local menu_opts = {}
 	if opts then
-		menu_opts.submenu, menu_opts.mouse_nav = opts.submenu, opts.mouse_nav
+		menu_opts.mouse_nav = opts.mouse_nav
 		if opts.on_close then menu_opts.on_close = function() run_command(opts.on_close) end end
 	end
 	local menu = Menu:open(data, run_command, menu_opts)
@@ -26,7 +26,7 @@ function toggle_menu_with_items(opts)
 	else open_command_menu({type = 'menu', items = config.menu_items}, opts) end
 end
 
----@param options {type: string; title: string; list_prop: string; active_prop?: string; serializer: fun(list: any, active: any): MenuDataItem[]; on_select: fun(value: any)}
+---@param options {type: string; title: string; list_prop: string; active_prop?: string; serializer: fun(list: any, active: any): MenuDataItem[]; on_select: fun(value: any); on_move_item?: fun(from_index: integer, to_index: integer, submenu_path: integer[]); on_delete_item?: fun(index: integer, submenu_path: integer[])}
 function create_self_updating_menu_opener(options)
 	return function()
 		if Menu:is_open(options.type) then Menu:close() return end
@@ -65,6 +65,8 @@ function create_self_updating_menu_opener(options)
 				mp.unobserve_property(handle_list_prop_change)
 				mp.unobserve_property(handle_active_prop_change)
 			end,
+			on_move_item = options.on_move_item,
+			on_delete_item = options.on_delete_item,
 		})
 	end
 end
@@ -178,7 +180,7 @@ function open_file_navigation_menu(directory_path, handle_select, opts)
 	local items = {}
 
 	if is_root then
-		if state.os == 'windows' then
+		if state.platform == 'windows' then
 			items[#items + 1] = {title = '..', hint = '驱动器列表', value = '{drives}', separator = true}
 		end
 	else
diff --git a/portable_config/scripts/uosc/lib/text.lua b/portable_config/scripts/uosc/lib/text.lua
index e310f2b6..b18a7515 100644
--- a/portable_config/scripts/uosc/lib/text.lua
+++ b/portable_config/scripts/uosc/lib/text.lua
@@ -251,8 +251,8 @@ do
 		local unicode = utf8_to_unicode(char, 1)
 		for _, block in ipairs(zero_width_blocks) do
 			if unicode >= block[1] and unicode <= block[2] then
-				char_widths[char] = {0, infinity}
-				return 0, infinity
+				char_widths[char] = {0, INFINITY}
+				return 0, INFINITY
 			end
 		end
 
@@ -302,7 +302,7 @@ end
 ---@return number, integer
 local function character_based_width(text, bold)
 	local max_width = 0
-	local min_px = infinity
+	local min_px = INFINITY
 	for line in tostring(text):gmatch("([^\n]*)\n?") do
 		local total_width = 0
 		for _, char in utf8_iter(line) do
diff --git a/portable_config/scripts/uosc/lib/utils.lua b/portable_config/scripts/uosc/lib/utils.lua
index 73f86290..e07c10d3 100644
--- a/portable_config/scripts/uosc/lib/utils.lua
+++ b/portable_config/scripts/uosc/lib/utils.lua
@@ -5,7 +5,7 @@ sort_filenames = (function()
 	local symbol_order
 	local default_order
 
-	if state.os == 'windows' then
+	if state.platform == 'windows' then
 		symbol_order = {
 			['!'] = 1, ['#'] = 2, ['$'] = 3, ['%'] = 4, ['&'] = 5, ['('] = 6, [')'] = 6, [','] = 7,
 			['.'] = 8, ["'"] = 9, ['-'] = 10, [';'] = 11, ['@'] = 12, ['['] = 13, [']'] = 13, ['^'] = 14,
@@ -93,6 +93,18 @@ function get_point_to_rectangle_proximity(point, rect)
 	return math.sqrt(dx * dx + dy * dy)
 end
 
+---@param point_a {x: number; y: number}
+---@param point_b {x: number; y: number}
+function get_point_to_point_proximity(point_a, point_b)
+	local dx, dy = point_a.x - point_b.x, point_a.y - point_b.y
+	return math.sqrt(dx * dx + dy * dy)
+end
+
+-- Call function with args if it exists
+function call_maybe(fn, ...)
+	if type(fn) == 'function' then fn(...) end
+end
+
 -- Extracts the properties used by property expansion of that string.
 ---@param str string
 ---@param res { [string] : boolean } | nil
@@ -155,7 +167,7 @@ function opacity_to_alpha(opacity)
 end
 
 path_separator = (function()
-	local os_separator = state.os == 'windows' and '\\' or '/'
+	local os_separator = state.platform == 'windows' and '\\' or '/'
 
 	-- Get appropriate path separator for the given path.
 	---@param path string
@@ -181,7 +193,7 @@ end
 ---@return boolean
 function is_absolute(path)
 	if path:sub(1, 2) == '\\\\' then return true
-	elseif state.os == 'windows' then return path:find('^%a+:') ~= nil
+	elseif state.platform == 'windows' then return path:find('^%a+:') ~= nil
 	else return path:sub(1, 1) == '/' end
 end
 
@@ -199,7 +211,7 @@ end
 function trim_trailing_separator(path)
 	local separator = path_separator(path)
 	path = trim_end(path, separator)
-	if state.os == 'windows' then
+	if state.platform == 'windows' then
 		-- Drive letters on windows need trailing backslash
 		if path:sub(#path) == ':' then path = path .. '\\' end
 	else
@@ -226,12 +238,12 @@ function normalize_path(path)
 
 	path = ensure_absolute(path)
 	local is_unc = path:sub(1, 2) == '\\\\'
-	if state.os == 'windows' or is_unc then path = path:gsub('/', '\\') end
+	if state.platform == 'windows' or is_unc then path = path:gsub('/', '\\') end
 	path = trim_trailing_separator(path)
 
 	--Deduplication of path separators
 	if is_unc then path = path:gsub('(.\\)\\+', '%1')
-	elseif state.os == 'windows' then path = path:gsub('\\\\+', '\\')
+	elseif state.platform == 'windows' then path = path:gsub('\\\\+', '\\')
 	else path = path:gsub('//+', '/') end
 
 	return path
@@ -393,7 +405,7 @@ end
 -- `status:number(<0=error), stdout, stderr, error_string, killed_by_us:boolean`
 ---@param path string
 function delete_file(path)
-	if state.os == 'windows' then
+	if state.platform == 'windows' then
 		if options.use_trash then
 			local ps_code = [[
 				Add-Type -AssemblyName Microsoft.VisualBasic
@@ -428,10 +440,23 @@ end
 function serialize_chapter_ranges(normalized_chapters)
 	local ranges = {}
 	local simple_ranges = {
-		{name = 'openings', patterns = {'^op ', '^op$', ' op$', 'opening$'}, requires_next_chapter = true},
-		{name = 'intros', patterns = {'^intro$'}, requires_next_chapter = true},
-		{name = 'endings', patterns = {'^ed ', '^ed$', ' ed$', 'ending$', 'closing$'}},
-		{name = 'outros', patterns = {'^outro$'}},
+		{name = 'openings', patterns = {
+				'^op ', '^op$', ' op$',
+				'^opening$', ' opening$'
+			}, requires_next_chapter = true},
+		{name = 'intros', patterns = {
+				'^intro$', ' intro$',
+				'^avant$', '^prologue$'
+			}, requires_next_chapter = true},
+		{name = 'endings', patterns = {
+				'^ed ', '^ed$', ' ed$',
+				'^ending ', '^ending$', ' ending$',
+			}},
+		{name = 'outros', patterns = {
+				'^outro$', ' outro$',
+				'^closing$', '^closing ',
+				'^preview$', '^pv$',
+			}},
 	}
 	local sponsor_ranges = {}
 
@@ -455,7 +480,7 @@ function serialize_chapter_ranges(normalized_chapters)
 					if next_chapter or not meta.requires_next_chapter then
 						ranges[#ranges + 1] = table_assign({
 							start = chapter.time,
-							['end'] = next_chapter and next_chapter.time or infinity,
+							['end'] = next_chapter and next_chapter.time or INFINITY,
 						}, config.chapter_ranges[meta.name])
 					end
 				end
@@ -484,7 +509,7 @@ function serialize_chapter_ranges(normalized_chapters)
 				local next_chapter = chapters[i + 1]
 				ranges[#ranges + 1] = table_assign({
 					start = chapter.time,
-					['end'] = next_chapter and next_chapter.time or infinity,
+					['end'] = next_chapter and next_chapter.time or INFINITY,
 				}, config.chapter_ranges.ads)
 			end
 		end
@@ -540,6 +565,8 @@ function render()
 	if not display.initialized then return end
 	state.render_last_time = mp.get_time()
 
+	cursor.reset_handlers()
+
 	-- Actual rendering
 	local ass = assdraw.ass_new()
 
@@ -553,6 +580,8 @@ function render()
 		end
 	end
 
+	cursor.decide_keybinds()
+
 	-- submit
 	if osd.res_x == display.width and osd.res_y == display.height and osd.data == ass.text then
 		return
diff --git a/portable_config/scripts/uosc/main.lua b/portable_config/scripts/uosc/main.lua
index 3eefae1c..4b003286 100644
--- a/portable_config/scripts/uosc/main.lua
+++ b/portable_config/scripts/uosc/main.lua
@@ -1,6 +1,6 @@
 --[[
 SOURCE_ https://github.com/tomasklaen/uosc/tree/main/scripts
-COMMIT_ 94ec120923cfdc973cb30a5acdb192c8ae005c19
+COMMIT_ 808fa5941842e3fdf115e3433d828ee56c1e4456
 
 极简主义设计驱动的多功能界面脚本群组，兼容 thumbfast 新缩略图引擎
 ]]--
@@ -13,8 +13,8 @@ opt = require('mp.options')
 utils = require('mp.utils')
 msg = require('mp.msg')
 osd = mp.create_osd_overlay('ass-events')
-infinity = 1e309
-quarter_pi_sin = math.sin(math.pi / 4)
+INFINITY = 1e309
+QUARTER_PI_SIN = math.sin(math.pi / 4)
 
 --[[ OPTIONS ]]
 
@@ -101,7 +101,7 @@ defaults = {
 	curtain_opacity = 0.5,
 	stream_quality_options = '4320,2160,1440,1080,720,480,360,240,144',
 	video_types= '3g2,3gp,asf,avi,f4v,flv,h264,h265,m2ts,m4v,mkv,mov,mp4,mp4v,mpeg,mpg,ogm,ogv,rm,rmvb,ts,vob,webm,wmv,y4m',
-	audio_types= 'aac,aiff,ape,au,dsf,dts,flac,m4a,mid,midi,mka,mp3,mp4a,oga,ogg,opus,spx,tak,tta,wav,weba,wma,wv',
+	audio_types= 'aac,ac3,aiff,ape,au,dsf,dts,flac,m4a,mid,midi,mka,mp3,mp4a,oga,ogg,opus,spx,tak,tta,wav,weba,wma,wv',
 	image_types= 'apng,avif,bmp,gif,j2k,jp2,jfif,jpeg,jpg,jxl,mj2,png,svg,tga,tif,tiff,webp',
 	subtitle_types = 'aqt,ass,gsub,idx,jss,lrc,mks,pgs,pjs,psb,rt,slt,smi,sub,sup,srt,ssa,ssf,ttxt,txt,usf,vt,vtt',
 	default_directory = '~/',
@@ -316,12 +316,62 @@ end
 --[[ STATE ]]
 
 display = {width = 1280, height = 720, scale_x = 1, scale_y = 1, initialized = false}
-cursor = {hidden = true, hover_raw = false, x = 0, y = 0}
+cursor = {
+	x = 0,
+	y = 0,
+	hidden = true,
+	hover_raw = false,
+	-- Event handlers that are only fired on cursor, bound during render loop. Guidelines:
+	-- - element activations (clicks) go to `on_primary_down` handler
+	-- - `on_primary_up` is only for clearing dragging/swiping
+	on_primary_down = nil,
+	on_primary_up = nil,
+	on_wheel_down = nil,
+	on_wheel_up = nil,
+	-- Called at the beginning of each render
+	reset_handlers = function()
+		cursor.on_primary_down, cursor.on_primary_up = nil, nil
+		cursor.on_wheel_down, cursor.on_wheel_up = nil, nil
+	end,
+	mbtn_left_enabled = nil,
+	wheel_enabled = nil,
+	-- Enables pointer key group captures needed by handlers (called at the end of each render)
+	decide_keybinds = function()
+		local enable_mbtn_left = (cursor.on_primary_down or cursor.on_primary_up) ~= nil
+		local enable_wheel = (cursor.on_wheel_down or cursor.on_wheel_up) ~= nil
+		if enable_mbtn_left ~= cursor.mbtn_left_enabled then
+			mp[(enable_mbtn_left and 'enable' or 'disable') .. '_key_bindings']('mbtn_left')
+			cursor.mbtn_left_enabled = enable_mbtn_left
+		end
+		if enable_wheel ~= cursor.wheel_enabled then
+			mp[(enable_wheel and 'enable' or 'disable') .. '_key_bindings']('wheel')
+			cursor.wheel_enabled = enable_wheel
+		end
+	end,
+	-- Cursor auto-hiding after period of inactivity
+	autohide = function()
+		if not Menu:is_open() then handle_mouse_leave() end
+	end,
+	autohide_timer = mp.add_timeout((tonumber(mp.get_property_native('cursor-autohide')) or 1000) / 1000, function() -- non-numeric setting (e.g. 'always') falls back to 1s
+		cursor.autohide()
+	end),
+	queue_autohide = function()
+		if options.autohide then
+			cursor.autohide_timer:kill()
+			cursor.autohide_timer:resume()
+		end
+	end
+}
 state = {
-	os = (function()
-		if os.getenv('windir') ~= nil then return 'windows' end
-		local homedir = os.getenv('HOME')
-		if homedir ~= nil and string.sub(homedir, 1, 6) == '/Users' then return 'macos' end
+	platform = (function()
+		local platform = mp.get_property_native('platform')
+		if platform then
+			if itable_index_of({'windows', 'darwin'}, platform) then return platform end
+		else
+			if os.getenv('windir') ~= nil then return 'windows' end
+			local homedir = os.getenv('HOME')
+			if homedir ~= nil and string.sub(homedir, 1, 6) == '/Users' then return 'darwin' end
+		end
 		return 'linux'
 	end)(),
 	cwd = mp.get_property('working-directory'),
@@ -356,10 +406,6 @@ state = {
 	has_chapter = false,
 	has_playlist = false,
 	shuffle = options.shuffle,
-	cursor_autohide_timer = mp.add_timeout(mp.get_property_native('cursor-autohide') / 1000, function()
-		if not options.autohide then return end
-		handle_mouse_leave()
-	end),
 	mouse_bindings_enabled = false,
 	uncached_ranges = nil,
 	cache = nil,
@@ -375,7 +421,7 @@ state = {
 	margin_bottom = 0,
 	hidpi_scale = 1,
 }
-thumbnail = {width = 0, height = 0, disabled = false, pause = false}
+thumbnail = {width = 0, height = 0, disabled = false}
 external = {} -- Properties set by external scripts
 key_binding_overwrites = {} -- Table of key_binding:mpv_command
 Elements = require('elements/Elements')
@@ -420,6 +466,7 @@ function update_fullormaxed()
 	state.fullormaxed = state.fullscreen or state.maximized
 	update_display_dimensions()
 	Elements:trigger('prop_fullormaxed', state.fullormaxed)
+	handle_mouse_move(INFINITY, INFINITY)
 end
 
 function update_human_times()
@@ -482,7 +529,7 @@ function update_cursor_position(x, y)
 	-- we receive a first real mouse move event with coordinates other than 0,0.
 	if not state.first_real_mouse_move_received then
 		if x > 0 and y > 0 then state.first_real_mouse_move_received = true
-		else x, y = infinity, infinity end
+		else x, y = INFINITY, INFINITY end
 	end
 
 	-- add 0.5 to be in the middle of the pixel
@@ -518,12 +565,7 @@ function handle_mouse_move(x, y)
 	update_cursor_position(x, y)
 	Elements:proximity_trigger('mouse_move')
 	request_render()
-
-	-- Restart timer that hides UI when mouse is autohidden
-	if options.autohide then
-		state.cursor_autohide_timer:kill()
-		state.cursor_autohide_timer:resume()
-	end
+	cursor.queue_autohide()
 end
 
 function handle_file_end()
@@ -687,16 +729,15 @@ mp.observe_property('duration', 'number', create_state_setter('duration', update
 mp.observe_property('speed', 'number', create_state_setter('speed', update_human_times))
 mp.observe_property('track-list', 'native', function(name, value)
 	-- checks the file dispositions
-	local is_image = false
-	local types = {sub = 0, audio = 0, video = 0}
+	local types = {sub = 0, image = 0, audio = 0, video = 0}
 	for _, track in ipairs(value) do
 		if track.type == 'video' then
-			is_image = track.image
-			if not is_image and not track.albumart then types.video = types.video + 1 end
+			if track.image or track.albumart then types.image = types.image + 1
+			else types.video = types.video + 1 end
 		elseif types[track.type] then types[track.type] = types[track.type] + 1 end
 	end
 	set_state('is_audio', types.video == 0 and types.audio > 0)
-	set_state('is_image', is_image)
+	set_state('is_image', types.image > 0 and types.video == 0 and types.audio == 0)
 	set_state('has_audio', types.audio > 0)
 	set_state('has_many_audio', types.audio > 1)
 	set_state('has_sub', types.sub > 0)
@@ -800,6 +841,23 @@ mp.observe_property('core-idle', 'native', create_state_setter('core_idle'))
 
 --[[ KEY BINDS ]]
 
+-- Pointer related binding groups
+mp.set_key_bindings({
+	{
+		'mbtn_left',
+		function(...) call_maybe(cursor.on_primary_up, ...) end,
+		function(...)
+			update_mouse_pos(nil, mp.get_property_native('mouse-pos'))
+			call_maybe(cursor.on_primary_down, ...)
+		end,
+	},
+	{'mbtn_left_dbl', 'ignore'},
+}, 'mbtn_left', 'force')
+mp.set_key_bindings({
+	{'wheel_up', function(...) call_maybe(cursor.on_wheel_up, ...) end},
+	{'wheel_down', function(...) call_maybe(cursor.on_wheel_down, ...) end},
+}, 'wheel', 'force')
+
 -- Adds a key binding that respects rerouting set by `key_binding_overwrites` table.
 ---@param name string
 ---@param callback fun(event: table)
@@ -889,6 +947,10 @@ bind_command('playlist', create_self_updating_menu_opener({
 		return items
 	end,
 	on_select = function(index) mp.commandv('set', 'playlist-pos-1', tostring(index)) end,
+	on_move_item = function(from, to) -- 1-based menu indexes; `playlist-move` is 0-based and inserts before its 2nd index
+		mp.commandv('playlist-move', tostring(from - 1), tostring(to > from and to or to - 1))
+	end,
+	on_delete_item = function(index) mp.commandv('playlist-remove', tostring(index - 1)) end,
 }))
 bind_command('chapters', create_self_updating_menu_opener({
 	title = '章节列表',
@@ -933,11 +995,11 @@ bind_command('show-in-directory', function()
 	-- Ignore URLs
 	if not state.path or is_protocol(state.path) then return end
 
-	if state.os == 'windows' then
+	if state.platform == 'windows' then
 		utils.subprocess_detached({args = {'explorer', '/select,', state.path}, cancellable = false})
-	elseif state.os == 'macos' then
+	elseif state.platform == 'macos' then
 		utils.subprocess_detached({args = {'open', '-R', state.path}, cancellable = false})
-	elseif state.os == 'linux' then
+	elseif state.platform == 'linux' then
 		local result = utils.subprocess({args = {'nautilus', state.path}, cancellable = false})
 
 		-- Fallback opens the folder with xdg-open instead
@@ -1118,11 +1180,11 @@ bind_command('open-config-directory', function()
 	if config then
 		local args
 
-		if state.os == 'windows' then
+		if state.platform == 'windows' then
 			args = {'explorer', '/select,', config.path}
-		elseif state.os == 'macos' then
+		elseif state.platform == 'darwin' then -- state.platform identifies macOS as 'darwin'
 			args = {'open', '-R', config.path}
-		elseif state.os == 'linux' then
+		elseif state.platform == 'linux' then
 			args = {'xdg-open', config.dirname}
 		end
 
diff --git a/portable_config/shaders/gaussianBlur_next.glsl b/portable_config/shaders/gaussianBlur_next.glsl
new file mode 100644
index 00000000..424444eb
--- /dev/null
+++ b/portable_config/shaders/gaussianBlur_next.glsl
@@ -0,0 +1,65 @@
+//!HOOK MAIN
+//!BIND HOOKED
+//!SAVE PASS0
+//!DESC gaussian blur pass0
+
+vec4 hook() {
+    return linearize(textureLod(HOOKED_raw, HOOKED_pos, 0.0) * HOOKED_mul);
+}
+
+//!HOOK MAIN
+//!BIND PASS0
+//!SAVE PASS1
+//!DESC gaussian blur pass1
+
+////////////////////////////////////////////////////////////////////////
+// USER CONFIGURABLE, PASS 1 (blur in y axis)
+//
+// CAUTION! probably should use the same settings for "USER CONFIGURABLE, PASS 2" below
+//
+#define SIGMA 1.0 //blur spread or amount, (0.0, 10+]
+#define RADIUS 3.0 //kernel radius (integer as float, e.g. 3.0), (0.0, 10+]; probably should set it to ceil(3 * SIGMA)
+//
+////////////////////////////////////////////////////////////////////////
+
+#define get_weight(x) (exp(-x * x / (2.0 * SIGMA * SIGMA)))
+
+vec4 hook() {
+    float weight;
+    vec4 csum = textureLod(PASS0_raw, PASS0_pos, 0.0) * PASS0_mul;
+    float wsum = 1.0;
+    for(float i = 1.0; i <= RADIUS; ++i) {
+        weight = get_weight(i);
+        csum += (textureLod(PASS0_raw, PASS0_pos + PASS0_pt * vec2(0.0, -i), 0.0) + textureLod(PASS0_raw, PASS0_pos + PASS0_pt * vec2(0.0, i), 0.0)) * PASS0_mul * weight;
+        wsum += 2.0 * weight;
+    }
+    return csum / wsum;
+}
+
+//!HOOK MAIN
+//!BIND PASS1
+//!DESC gaussian blur pass2
+
+////////////////////////////////////////////////////////////////////////
+// USER CONFIGURABLE, PASS 2 (blur in x axis)
+//
+// CAUTION! probably should use the same settings for "USER CONFIGURABLE, PASS 1" above
+//
+#define SIGMA 1.0 //blur spread or amount, (0.0, 10+]
+#define RADIUS 3.0 //kernel radius (integer as float, e.g. 3.0), (0.0, 10+]; probably should set it to ceil(3 * SIGMA)
+//
+////////////////////////////////////////////////////////////////////////
+
+#define get_weight(x) (exp(-x * x / (2.0 * SIGMA * SIGMA)))
+
+vec4 hook() {
+    float weight;
+    vec4 csum = textureLod(PASS1_raw, PASS1_pos, 0.0) * PASS1_mul;
+    float wsum = 1.0;
+    for(float i = 1.0; i <= RADIUS; ++i) {
+        weight = get_weight(i);
+        csum += (textureLod(PASS1_raw, PASS1_pos + PASS1_pt * vec2(-i, 0.0), 0.0) + textureLod(PASS1_raw, PASS1_pos + PASS1_pt * vec2(i, 0.0), 0.0)) * PASS1_mul * weight;
+        wsum += 2.0 * weight;
+    }
+    return delinearize(csum / wsum);
+}
diff --git a/portable_config/shaders/guided.glsl b/portable_config/shaders/guided.glsl
index 7d290255..3c804da3 100644
--- a/portable_config/shaders/guided.glsl
+++ b/portable_config/shaders/guided.glsl
@@ -77,8 +77,8 @@ vec4 hook()
 //!HOOK RGB
 //!DESC Guided filter (MEANI)
 //!BIND I
-//!WIDTH I.w 2.0 /
-//!HEIGHT I.h 2.0 /
+//!WIDTH I.w 1.5 /
+//!HEIGHT I.h 1.5 /
 //!SAVE MEANI
 
 vec4 hook()
@@ -169,7 +169,7 @@ vec4 hook()
 //!HEIGHT I.h
 //!SAVE A
 
-#define E 0.001
+#define E 0.0013
 
 vec4 hook()
 {
diff --git a/portable_config/shaders/guided_lgc.glsl b/portable_config/shaders/guided_lgc.glsl
index 9207f4c4..5278740d 100644
--- a/portable_config/shaders/guided_lgc.glsl
+++ b/portable_config/shaders/guided_lgc.glsl
@@ -16,7 +16,7 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-//desc: Luma-guided-chroma guided filter
+//desc: Luma-guided-chroma denoising.
 
 /* The radius can be adjusted with the MEANI stage's downscaling factor. 
  * Higher numbers give a bigger radius.
diff --git a/portable_config/shaders/guided_s.glsl b/portable_config/shaders/guided_s.glsl
index 0bf3ad84..cc8f4467 100644
--- a/portable_config/shaders/guided_s.glsl
+++ b/portable_config/shaders/guided_s.glsl
@@ -46,8 +46,8 @@ vec4 hook()
 //!HOOK RGB
 //!DESC Guided filter (MEANIP)
 //!BIND IP
-//!WIDTH IP.w 2.0 /
-//!HEIGHT IP.h 2.0 /
+//!WIDTH IP.w 1.5 /
+//!HEIGHT IP.h 1.5 /
 //!SAVE MEANIP
 
 vec4 hook()
@@ -93,7 +93,7 @@ vec4 hook()
 //!HEIGHT IP.h
 //!SAVE A
 
-#define E 0.001
+#define E 0.002
 
 vec4 hook()
 {
diff --git a/portable_config/shaders/nlmeans.glsl b/portable_config/shaders/nlmeans.glsl
index d2e119d0..cecbc09a 100644
--- a/portable_config/shaders/nlmeans.glsl
+++ b/portable_config/shaders/nlmeans.glsl
@@ -82,11 +82,10 @@
  *   - Reflections may have a significant speed impact
  *
  * Options which always disable textureGather:
- * 	- RF
  * 	- PD
  */
 
-// The following is shader code injected from guided_s.glsl
+// The following is shader code injected from guided.glsl
 /* vi: ft=c
  *
  * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
@@ -105,25 +104,53 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-//desc: "Self-guided" guided filter
+//desc: Guided filter guided by the downscaled image
 
-/* The radius can be adjusted with the MEANIP stage's downscaling factor. 
+/* The radius can be adjusted with the MEANI stage's downscaling factor. 
  * Higher numbers give a bigger radius.
  *
  * The E variable can be found in the A stage.
  *
- * The subsampling (fast guided filter) can be adjusted with the IP stage's 
+ * The subsampling (fast guided filter) can be adjusted with the I stage's 
  * downscaling factor. Higher numbers are faster.
+ *
+ * The guide's subsampling can be adjusted with the PREI stage's downscaling 
+ * factor. Higher numbers downscale more.
  */
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (IP)
+//!DESC Guided filter (PREI)
 //!BIND HOOKED
+//!WIDTH HOOKED.w 1.25 /
+//!HEIGHT HOOKED.h 1.25 /
+//!SAVE _INJ_PREI
+
+vec4 hook()
+{
+	 return HOOKED_texOff(0); 
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (I)
+//!BIND _INJ_PREI
 //!WIDTH HOOKED.w 1.0 /
 //!HEIGHT HOOKED.h 1.0 /
-//!SAVE _INJ_IP
+//!SAVE _INJ_I
+
+vec4 hook()
+{
+return _INJ_PREI_texOff(0);
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (P)
+//!BIND HOOKED
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_P
 
 vec4 hook()
 {
@@ -132,87 +159,124 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (MEANIP)
-//!BIND _INJ_IP
-//!WIDTH _INJ_IP.w 2.0 /
-//!HEIGHT _INJ_IP.h 2.0 /
-//!SAVE _INJ_MEANIP
+//!DESC Guided filter (MEANI)
+//!BIND _INJ_I
+//!WIDTH _INJ_I.w 1.5 /
+//!HEIGHT _INJ_I.h 1.5 /
+//!SAVE _INJ_MEANI
 
 vec4 hook()
 {
-return _INJ_IP_texOff(0);
+return _INJ_I_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (_INJ_IP_SQ)
-//!BIND _INJ_IP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
-//!SAVE _INJ_IP_SQ
+//!DESC Guided filter (MEANP)
+//!BIND _INJ_P
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_MEANP
 
 vec4 hook()
 {
-return _INJ_IP_texOff(0) * _INJ_IP_texOff(0);
+return _INJ_P_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (CORRIP)
-//!BIND _INJ_IP_SQ
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
-//!SAVE _INJ_CORRIP
+//!DESC Guided filter (_INJ_I_SQ)
+//!BIND _INJ_I
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_I_SQ
 
 vec4 hook()
 {
-return _INJ_IP_SQ_texOff(0);
+return _INJ_I_texOff(0) * _INJ_I_texOff(0);
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (_INJ_IXP)
+//!BIND _INJ_I
+//!BIND _INJ_P
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_IXP
+
+vec4 hook()
+{
+return _INJ_I_texOff(0) * _INJ_P_texOff(0);
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (CORRI)
+//!BIND _INJ_I_SQ
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_CORRI
+
+vec4 hook()
+{
+return _INJ_I_SQ_texOff(0);
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (CORRP)
+//!BIND _INJ_IXP
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_CORRP
+
+vec4 hook()
+{
+return _INJ_IXP_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter (A)
-//!BIND _INJ_MEANIP
-//!BIND _INJ_CORRIP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
+//!BIND _INJ_MEANI
+//!BIND _INJ_MEANP
+//!BIND _INJ_CORRI
+//!BIND _INJ_CORRP
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
 //!SAVE _INJ_A
 
-#define E 0.001
+#define E 0.0013
 
 vec4 hook()
 {
-vec4 var = _INJ_CORRIP_texOff(0) - _INJ_MEANIP_texOff(0) * _INJ_MEANIP_texOff(0);
-	 vec4 cov = var; 
+vec4 var = _INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0);
+vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0);
 	 return cov / (var + E); 
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter (B)
 //!BIND _INJ_A
-//!BIND _INJ_MEANIP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
+//!BIND _INJ_MEANI
+//!BIND _INJ_MEANP
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
 //!SAVE _INJ_B
 
 vec4 hook()
 {
-return _INJ_MEANIP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANIP_texOff(0);
+return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter (MEANA)
 //!BIND _INJ_A
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
 //!SAVE _INJ_MEANA
 
 vec4 hook()
@@ -222,11 +286,10 @@ return _INJ_A_texOff(0);
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter (MEANB)
 //!BIND _INJ_B
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
 //!SAVE _INJ_MEANB
 
 vec4 hook()
@@ -236,7 +299,6 @@ return _INJ_B_texOff(0);
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter
 //!BIND HOOKED
 //!BIND _INJ_MEANA
@@ -248,10 +310,9 @@ vec4 hook()
 return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
 }
 
-// End of source code injected from guided_s.glsl
+// End of source code injected from guided.glsl
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Non-local means (downscale)
 //!WIDTH LUMA.w 3 /
 //!HEIGHT LUMA.h 3 /
@@ -265,7 +326,6 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Non-local means (share)
 //!BIND RF_LUMA
 //!SAVE RF
@@ -277,7 +337,6 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!BIND HOOKED
 //!BIND RF_LUMA
 //!BIND EP
@@ -304,11 +363,10 @@ vec4 hook()
  *
  * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
  * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so enable RF when using even 
- * patch/research sizes.
+ * incompatible with textureGather optimizations, so set NG=1 to disable them.
  */
 #ifdef LUMA_raw
-#define S 2.25
+#define S 20.0
 #define P 3
 #define R 5
 #else
@@ -319,24 +377,37 @@ vec4 hook()
 
 /* Adaptive sharpening
  *
- * Uses the blur incurred by denoising plus the weight map to perform an 
- * unsharp mask that gets applied most strongly to edges.
+ * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
+ * weight map to restrict the sharpening to edges.
  *
- * Sharpening will amplify noise, so the denoising factor (S) should usually be 
- * increased to compensate.
+ * Use M=4 to get a good look at which areas are/aren't sharpened.
  *
  * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
  * ASF: Sharpening factor, higher numbers make a sharper underlying image
  * ASP: Weight power, higher numbers use more of the sharp image
+ * ASW: Weight source:
+ * 	- 0 to use pre-WD weights
+ * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
+ * ASK: Weight kernel:
+ * 	- 0 for power. This is the old method.
+ * 	- 1 for sigmoid. This is generally recommended.
+ * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
+ * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
  */
 #ifdef LUMA_raw
 #define AS 0
-#define ASF 1.0
-#define ASP 2.0
+#define ASF 2.0
+#define ASP 32.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
 #else
 #define AS 0
-#define ASF 1.0
-#define ASP 2.0
+#define ASF 2.0
+#define ASP 32.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
 #endif
 
 /* Starting weight
@@ -345,8 +416,8 @@ vec4 hook()
  * handle higher noise levels, ringing, and may be useful for other things too?
  *
  * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight variable may be used to make SW adapt to the local noise level, 
- * e.g., SW=max(avg_weight, EPSILON)
+ * avg_weight/old_avg_weight variables may be used to make SW adapt to the 
+ * local noise level, e.g., SW=max(avg_weight, EPSILON)
  */
 #ifdef LUMA_raw
 #define SW 1.0
@@ -366,7 +437,7 @@ vec4 hook()
  * 	- 0: Disable
  *
  * WDT: Threshold coefficient, higher numbers discard more
- * WDP (WD=1): Higher numbers reduce the threshold more for small sample sizes
+ * WDP (only for WD=1): Higher numbers reduce the threshold for small sample sizes
  */
 #ifdef LUMA_raw
 #define WD 2
@@ -378,6 +449,49 @@ vec4 hook()
 #define WDP 6.0
 #endif
 
+/* Extremes preserve
+ *
+ * Reduces denoising around very bright/dark areas. The downscaling factor of 
+ * EP (located near the top of this shader) controls the area sampled for 
+ * luminance (higher numbers consider more area).
+ *
+ * This is incompatible with RGB. If you have RGB hooks enabled then you will 
+ * have to delete the EP shader stage or specify EP=0 through nlmeans_cfg.
+ *
+ * EP: 1 to enable, 0 to disable
+ * DP: EP strength on dark patches, 0 to fully denoise
+ * BP: EP strength on bright patches, 0 to fully denoise
+ */
+#ifdef LUMA_raw
+#define EP 1
+#define BP 0.75
+#define DP 0.25
+#else
+#define EP 0
+#define BP 0.0
+#define DP 0.0
+#endif
+
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from nlmeans_cfg, so this 
+ * setting can only be enabled via nlmeans_cfg.
+ *
+ * Compares the pixel-of-interest against a guide, which could be a downscaled 
+ * image or the output of another shader such as guided.glsl
+ */
+#ifdef LUMA_raw
+#define RF 1
+#else
+#define RF 1
+#endif
+
 /* Search shape
  *
  * Determines the shape of patches and research zones. Different shapes have 
@@ -386,6 +500,8 @@ vec4 hook()
  *
  * PS applies applies to patches, RS applies to research zones.
  *
+ * Be wary of gather optimizations (see the Regarding Speed comment at the top)
+ *
  * 0: square (symmetrical)
  * 1: horizontal line (asymmetric)
  * 2: vertical line (asymmetric)
@@ -432,8 +548,9 @@ vec4 hook()
  * 	- Buggy
  *
  * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (compression 
- * noise, repeating frames), but can work very well on high quality video.
+ * struggle more with noise that persists across multiple frames (e.g., from 
+ * compression or duplicate frames), but can work very well on high quality 
+ * video.
  *
  * Motion estimation (ME) should improve quality without impacting speed.
  *
@@ -468,60 +585,51 @@ vec4 hook()
  */
 #ifdef LUMA_raw
 #define SS 0.25
-#define SD vec3(1,1,1)
+#define SD vec3(1,1,1.5)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #else
 #define SS 0.25
-#define SD vec3(1,1,1)
+#define SD vec3(1,1,1.5)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #endif
 
-/* Extremes preserve
- *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
+// Scaling factor (should match WIDTH/HEIGHT)
+#ifdef LUMA_raw
+#define SF 1
+#else
+#define SF 1
+#endif
+
+/* Estimator
  *
- * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
+ * 0: means
+ * 1: Euclidean medians (extremely slow, may be good for heavy noise)
+ * 2: weight map (not a denoiser, maybe useful for generating image masks)
+ * 3: weighted median intensity (slow, may be good for heavy noise)
+ * 4: edge map (based on the relevant AS settings)
  */
 #ifdef LUMA_raw
-#define EP 1
-#define BP 0.75
-#define DP 0.25
+#define M 0
 #else
-#define EP 0
-#define BP 0.0
-#define DP 0.0
+#define M 0
 #endif
 
-/* Robust filtering
- *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
+/* Difference visualization
  *
- * Compares the pixel-of-interest against downscaled pixels.
+ * Visualizes the difference between input/output image
  *
- * This will virtually always improve quality, but will always disable 
- * textureGather optimizations.
- *
- * The downscale factor can be modified in the WIDTH/HEIGHT directives for the 
- * RF texture (for CHROMA, RGB) and RF_LUMA (LUMA only) textures near the top 
- * of this shader, higher numbers increase blur.
- *
- * Any notation of RF as a positive number should be assumed to be referring to 
- * the downscaling factor, e.g., RF=3 means RF is set to 1 and the downscaling 
- * factor is set to 3.
+ * 0: off
+ * 1: absolute difference scaled by S
+ * 2: difference centered on 0.5
  */
 #ifdef LUMA_raw
-#define RF 1
+#define DV 0
 #else
-#define RF 1
+#define DV 0
 #endif
 
 /* Blur factor
@@ -535,30 +643,11 @@ vec4 hook()
 #define BF 1.0
 #endif
 
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-
-// Scaling factor (should match WIDTH/HEIGHT)
-#ifdef LUMA_raw
-#define SF 1
-#else
-#define SF 1
-#endif
-
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- */
+// Force disable textureGather
 #ifdef LUMA_raw
-#define M 0
+#define NG 0
 #else
-#define M 0
+#define NG 0
 #endif
 
 // Patch donut (probably useless)
@@ -568,7 +657,7 @@ vec4 hook()
 #define PD 0
 #endif
 
-// Duplicate 1st weight
+// Duplicate 1st weight (for LGC)
 #ifdef LUMA_raw
 #define D1W 0
 #else
@@ -578,6 +667,7 @@ vec4 hook()
 /* Shader code */
 
 #define EPSILON 0.00000000001
+#define M_PI 3.14159265358979323846
 
 #if PS == 6
 const int hp = P/2;
@@ -627,6 +717,7 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #define R_AREA(a) (a * T1 + RF-1)
 
 // research shapes
+// XXX would be nice to have the option of temporally-varying research sizes
 #if R == 0 || R == 1
 #define FOR_RESEARCH(r) S_1X1(r)
 const int r_area = R_AREA(1);
@@ -798,11 +889,13 @@ vec4 patch_comparison(vec3 r, vec3 r2)
 	return min_rot * p_scale;
 }
 
-#define NO_GATHER (PD == 0) // never textureGather if any of these conditions are false
+#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
 #define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
 
 #if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
 // 3x3 diamond/plus patch_comparison_gather
+// XXX extend to support arbitrary sizes (probably requires code generation)
+// XXX extend to support 3x3 square
 const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
 const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
 vec4 poi_patch = gather_offs(0, offsets);
@@ -832,6 +925,7 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 }
 #elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
 // tiled even square patch_comparison_gather
+// XXX extend to support odd square?
 vec4 patch_comparison_gather(vec3 r, vec3 r2)
 {
 	vec2 tile;
@@ -912,6 +1006,7 @@ vec4 hook()
 #endif
 
 	FOR_FRAME(r) {
+	// XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
 #if T && ME == 1 // temporal & motion estimation max weight
 	if (r.z > 0) {
 		me += me_tmp;
@@ -982,6 +1077,7 @@ vec4 hook()
 	} // FOR_RESEARCH
 	} // FOR_FRAME
 
+	// XXX optionally put the denoised pixel into the frame buffer?
 #if T // temporal
 #endif
 
@@ -1033,12 +1129,29 @@ vec4 hook()
 	result = sum / total_weight;
 #endif
 
+#if ASW == 0 // pre-WD weights
+#define AS_weight old_avg_weight
+#elif ASW == 1 // post-WD weights
+#define AS_weight avg_weight
+#endif
+
+#if ASK == 0
+	vec4 sharpening_strength = pow(AS_weight, vec4(ASP));
+#elif ASK == 1
+#define sigmoid(x) (tanh(x * 2*M_PI - M_PI)*0.5+0.5)
+	vec4 sharpening_strength = mix(pow(sigmoid(AS_weight), vec4(ASP)),
+	                               AS_weight, ASC);
+	// just in case ASC < 0 (will sharpen but it's janky XXX)
+	sharpening_strength = clamp(sharpening_strength, 0.0, 1.0);
+#elif ASK == 2
+	vec4 sharpening_strength = vec4(ASP);
+#endif
+
+	// XXX maybe allow for alternative blurs? e.g., replace result w/ load2?
 #if AS == 1 // sharpen+denoise
 	vec4 sharpened = result + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
 #elif AS == 2 // sharpen only
 	vec4 sharpened = poi + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
 #endif
 
 #if EP // extremes preserve
@@ -1049,9 +1162,23 @@ vec4 hook()
 #endif
 
 #if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_power);
+	result = mix(sharpened, result, sharpening_strength);
 #elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_power);
+	result = mix(sharpened, poi, sharpening_strength);
+#endif
+
+#if M == 4 // edge map
+	result = sharpening_strength;
+#endif
+
+#if (M == 2 || M == 4) && defined(CHROMA_raw) // drop chroma for weight maps
+	result = vec4(0.5);
+#endif
+
+#if DV == 1
+	result = clamp(abs(poi - result) * S, 0.0, 1.0);
+#elif DV == 2
+	result = (poi - result) * 0.5 + 0.5;
 #endif
 
 	return mix(poi, result, BF);
diff --git a/portable_config/shaders/nlmeans_anime.glsl b/portable_config/shaders/nlmeans_anime.glsl
deleted file mode 100644
index 51d1327c..00000000
--- a/portable_config/shaders/nlmeans_anime.glsl
+++ /dev/null
@@ -1,1044 +0,0 @@
-/* vi: ft=c
- *
- * Based on vf_nlmeans.c from FFmpeg.
- *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
- * Copyright (c) 2016 Clément Bœsch <u pkh me>
- *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-// Profile description: Tuned for anime/cartoons, may be useful for other content.
-
-/* The recommended usage of this shader and its variant profiles is to add them 
- * to input.conf and then dispatch the appropriate shader via a keybind during 
- * media playback. Here is an example input.conf entry:
- *
- * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)"
- *
- * These shaders can also be enabled by default in mpv.conf, for example:
- *
- * glsl-shaders='~~/shaders/nlmeans.glsl'
- *
- * Both of the examples above assume the shaders are located in a subdirectory 
- * named "shaders" within mpv's config directory. Refer to the mpv 
- * documentation for more details.
- *
- * This shader is highly configurable via user variables below. Although the 
- * default settings should offer good quality at a reasonable speed, you are 
- * encouraged to tweak them to your preferences. Be mindful that certain 
- * settings may greatly affect speed.
- *
- * Denoising is most useful for noisy content. If there is no perceptible 
- * noise, you probably won't see a positive difference.
- *
- * The default settings are generally tuned for low noise and high detail 
- * preservation. The "medium" and "heavy" profiles are tuned for higher levels 
- * of noise.
- *
- * The denoiser will not work properly if the content has been upscaled 
- * beforehand, whether it was done by you or someone down the line. Consider 
- * issuing a command to downscale in the mpv console, like so:
- *
- * vf toggle scale=-2:720
- *
- * ...replacing 720 with whatever resolution seems appropriate. Rerun the 
- * command to undo the downscale. It may take some trial-and-error to find the 
- * proper resolution.
- */
-
-/* Regarding speed
- *
- * Speed may vary wildly for different vo and gpu-api settings. Generally 
- * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this 
- * may be different for your system.
- *
- * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ and VLQ 
- * profiles.
- *
- * textureGather is LUMA only and limited to the following configurations:
- *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}:M!=1
- *   - Default, very fast, rotations and reflections should be free
- *   - If this is unusually slow then try changing gpu-api and vo
- *   - If it's still slow, try setting RI/RFI to 0.
- *
- * - PS=6:RI={0,1,3}:RFI={0,1,2}
- *   - Currently the only scalable variant
- *   - Patch shape is asymmetric on two axis
- *   - Rotations should have very little speed impact
- *   - Reflections may have a significant speed impact
- *
- * Options which always disable textureGather:
- * 	- RF
- * 	- PD
- */
-
-// The following is shader code injected from guided_s.glsl
-/* vi: ft=c
- *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
- *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY;  without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-//desc: "Self-guided" guided filter
-
-/* The radius can be adjusted with the MEANIP stage's downscaling factor. 
- * Higher numbers give a bigger radius.
- *
- * The E variable can be found in the A stage.
- *
- * The subsampling (fast guided filter) can be adjusted with the IP stage's 
- * downscaling factor. Higher numbers are faster.
- */
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (IP)
-//!BIND HOOKED
-//!WIDTH HOOKED.w 1.0 /
-//!HEIGHT HOOKED.h 1.0 /
-//!SAVE _INJ_IP
-
-vec4 hook()
-{
-	 return HOOKED_texOff(0); 
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (MEANIP)
-//!BIND _INJ_IP
-//!WIDTH _INJ_IP.w 2.0 /
-//!HEIGHT _INJ_IP.h 2.0 /
-//!SAVE _INJ_MEANIP
-
-vec4 hook()
-{
-return _INJ_IP_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (_INJ_IP_SQ)
-//!BIND _INJ_IP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
-//!SAVE _INJ_IP_SQ
-
-vec4 hook()
-{
-return _INJ_IP_texOff(0) * _INJ_IP_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (CORRIP)
-//!BIND _INJ_IP_SQ
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
-//!SAVE _INJ_CORRIP
-
-vec4 hook()
-{
-return _INJ_IP_SQ_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (A)
-//!BIND _INJ_MEANIP
-//!BIND _INJ_CORRIP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
-//!SAVE _INJ_A
-
-#define E 0.001
-
-vec4 hook()
-{
-vec4 var = _INJ_CORRIP_texOff(0) - _INJ_MEANIP_texOff(0) * _INJ_MEANIP_texOff(0);
-	 vec4 cov = var; 
-	 return cov / (var + E); 
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (B)
-//!BIND _INJ_A
-//!BIND _INJ_MEANIP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
-//!SAVE _INJ_B
-
-vec4 hook()
-{
-return _INJ_MEANIP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANIP_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (MEANA)
-//!BIND _INJ_A
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
-//!SAVE _INJ_MEANA
-
-vec4 hook()
-{
-return _INJ_A_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (MEANB)
-//!BIND _INJ_B
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
-//!SAVE _INJ_MEANB
-
-vec4 hook()
-{
-return _INJ_B_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter
-//!BIND HOOKED
-//!BIND _INJ_MEANA
-//!BIND _INJ_MEANB
-//!SAVE RF_LUMA
-
-vec4 hook()
-{
-return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
-}
-
-// End of source code injected from guided_s.glsl
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Non-local means (share)
-//!BIND RF_LUMA
-//!SAVE RF
-
-vec4 hook()
-{
-	return RF_LUMA_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!BIND HOOKED
-//!BIND RF_LUMA
-//!BIND RF
-//!DESC Non-local means (nlmeans_anime.glsl)
-
-/* User variables
- *
- * It is usually preferable to denoise chroma and luma differently, so the user 
- * variables for luma and chroma are split.
- */
-
-/* S = denoising factor
- * P = patch size
- * R = research size
- *
- * The denoising factor controls the level of blur, higher is blurrier.
- *
- * Patch size should usually be an odd number greater than or equal to 3. 
- * Higher values are slower and not always better.
- *
- * Research size usually be an odd number greater than or equal to 3. Higher 
- * values are usually better, but slower and offer diminishing returns.
- *
- * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
- * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so enable RF when using even 
- * patch/research sizes.
- */
-#ifdef LUMA_raw
-#define S 3
-#define P 3
-#define R 5
-#else
-#define S 3
-#define P 3
-#define R 5
-#endif
-
-/* Adaptive sharpening
- *
- * Uses the blur incurred by denoising plus the weight map to perform an 
- * unsharp mask that gets applied most strongly to edges.
- *
- * Sharpening will amplify noise, so the denoising factor (S) should usually be 
- * increased to compensate.
- *
- * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
- * ASF: Sharpening factor, higher numbers make a sharper underlying image
- * ASP: Weight power, higher numbers use more of the sharp image
- */
-#ifdef LUMA_raw
-#define AS 0
-#define ASF 1.0
-#define ASP 2.0
-#else
-#define AS 0
-#define ASF 1.0
-#define ASP 2.0
-#endif
-
-/* Starting weight
- *
- * Lower numbers give less weight to the pixel-of-interest, which may help 
- * handle higher noise levels, ringing, and may be useful for other things too?
- *
- * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight variable may be used to make SW adapt to the local noise level, 
- * e.g., SW=max(avg_weight, EPSILON)
- */
-#ifdef LUMA_raw
-#define SW 1.0
-#else
-#define SW 1.0
-#endif
-
-/* Weight discard
- *
- * Discard weights that fall below a fraction of the average weight. This culls 
- * the most dissimilar samples from the blur, yielding a much more pleasant 
- * result, especially around edges.
- * 
- * WD:
- * 	- 2: True average. Very good quality, but slower and uses more memory.
- * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
- * 	- 0: Disable
- *
- * WDT: Threshold coefficient, higher numbers discard more
- * WDP (WD=1): Higher numbers reduce the threshold more for small sample sizes
- */
-#ifdef LUMA_raw
-#define WD 2
-#define WDT 1.0
-#define WDP 6.0
-#else
-#define WD 2
-#define WDT 1.0
-#define WDP 6.0
-#endif
-
-/* Search shape
- *
- * Determines the shape of patches and research zones. Different shapes have 
- * different speed and quality characteristics. Every shape (besides square) is 
- * smaller than square.
- *
- * PS applies applies to patches, RS applies to research zones.
- *
- * 0: square (symmetrical)
- * 1: horizontal line (asymmetric)
- * 2: vertical line (asymmetric)
- * 3: diamond (symmetrical)
- * 4: triangle (asymmetric, pointing upward)
- * 5: truncated triangle (asymmetric on two axis, last row halved)
- * 6: even sized square (asymmetric on two axis)
- * 7: plus (symmetrical)
- */
-#ifdef LUMA_raw
-#define RS 3
-#define PS 3
-#else
-#define RS 3
-#define PS 3
-#endif
-
-/* Rotational/reflectional invariance
- *
- * Number of rotations/reflections to try for each patch comparison. Slow, but 
- * improves feature preservation, although adding more rotations/reflections 
- * gives diminishing returns. The most similar rotation/reflection will be used.
- *
- * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
- * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
- *
- * RI: Rotational invariance
- * RFI (0 to 2): Reflectional invariance
- */
-#ifdef LUMA_raw
-#define RI 3
-#define RFI 2
-#else
-#define RI 0
-#define RFI 0
-#endif
-
-/* Temporal denoising
- *
- * Caveats:
- * 	- Slower, each frame needs to be researched
- * 	- Requires vo=gpu-next and nlmeans_temporal.glsl
- * 	- Luma-only (this is a bug)
- * 	- Buggy
- *
- * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (compression 
- * noise, repeating frames), but can work very well on high quality video.
- *
- * Motion estimation (ME) should improve quality without impacting speed.
- *
- * T: number of frames used
- * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
- */
-#ifdef LUMA_raw
-#define T 0
-#define ME 1
-#else
-#define T 0
-#define ME 0
-#endif
-
-/* Spatial kernel
- *
- * Increasing the spatial denoising factor (SS) reduces the weight of further 
- * pixels.
- *
- * Spatial distortion instructs the spatial kernel to view that axis as 
- * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
- * appear closer and increase blur between frames.
- *
- * The intra-patch variants do not yet have well-understood effects. They are 
- * intended to make large patch sizes more useful. Likely slower.
- *
- * SS: spatial denoising factor
- * SD: spatial distortion (X, Y, time)
- * PSS: intra-patch spatial denoising factor
- * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
- * PSD: intra-patch spatial distortion (X, Y)
- */
-#ifdef LUMA_raw
-#define SS 0.0
-#define SD vec3(1,1,1)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#else
-#define SS 0.0
-#define SD vec3(1,1,1)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#endif
-
-/* Extremes preserve
- *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
- *
- * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
- */
-#ifdef LUMA_raw
-#define EP 0
-#define BP 0.75
-#define DP 0.25
-#else
-#define EP 0
-#define BP 0.0
-#define DP 0.0
-#endif
-
-/* Robust filtering
- *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
- *
- * Compares the pixel-of-interest against downscaled pixels.
- *
- * This will virtually always improve quality, but will always disable 
- * textureGather optimizations.
- *
- * The downscale factor can be modified in the WIDTH/HEIGHT directives for the 
- * RF texture (for CHROMA, RGB) and RF_LUMA (LUMA only) textures near the top 
- * of this shader, higher numbers increase blur.
- *
- * Any notation of RF as a positive number should be assumed to be referring to 
- * the downscaling factor, e.g., RF=3 means RF is set to 1 and the downscaling 
- * factor is set to 3.
- */
-#ifdef LUMA_raw
-#define RF 1
-#else
-#define RF 1
-#endif
-
-/* Blur factor
- *
- * 0 to 1, only useful for alternative estimators. You're probably looking for 
- * "S" (denoising factor), go back to the top of the shader!
- */
-#ifdef LUMA_raw
-#define BF 1.0
-#else
-#define BF 1.0
-#endif
-
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-
-// Scaling factor (should match WIDTH/HEIGHT)
-#ifdef LUMA_raw
-#define SF 1
-#else
-#define SF 1
-#endif
-
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- */
-#ifdef LUMA_raw
-#define M 0
-#else
-#define M 0
-#endif
-
-// Patch donut (probably useless)
-#ifdef LUMA_raw
-#define PD 0
-#else
-#define PD 0
-#endif
-
-// Duplicate 1st weight
-#ifdef LUMA_raw
-#define D1W 0
-#else
-#define D1W 0
-#endif
-
-/* Shader code */
-
-#define EPSILON 0.00000000001
-
-#if PS == 6
-const int hp = P/2;
-#else
-const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes
-#endif
-
-#if RS == 6
-const int hr = R/2;
-#else
-const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes
-#endif
-
-// donut increment, increments without landing on (0,0,0)
-// much faster than a "continue" statement
-#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
-
-// search shapes and their corresponding areas
-#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
-
-#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr)
-#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr)
-#define S_TRIANGLE_A(hz,Z) int(pow(hz, 2)+Z)
-
-#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr)
-#define S_DIAMOND_A(hz,Z) int(pow(hz, 2)*2+Z)
-
-#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
-#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
-
-#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
-#define S_PLUS_A(hz,Z) (Z*2 - 1)
-
-#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
-#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
-
-#define T1 (T+1)
-#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++)
-
-// Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF
-#define RINCR(z,c) (z.c++)
-#else
-#define RINCR DINCR
-#endif
-
-#define R_AREA(a) (a * T1 + RF-1)
-
-// research shapes
-#if R == 0 || R == 1
-#define FOR_RESEARCH(r) S_1X1(r)
-const int r_area = R_AREA(1);
-#elif RS == 7
-#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_PLUS_A(hr,R));
-#elif RS == 6
-#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R);
-#elif RS == 5
-#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,hr));
-#elif RS == 4
-#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,R));
-#elif RS == 3
-#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_DIAMOND_A(hr,R));
-#elif RS == 2
-#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R);
-#elif RS == 1
-#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x))
-const int r_area = R_AREA(R);
-#elif RS == 0
-#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R);
-#endif
-
-#define RI1 (RI+1)
-#define RFI1 (RFI+1)
-
-#if RI
-#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1)
-#else
-#define FOR_ROTATION
-#endif
-
-#if RFI
-#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++)
-#else
-#define FOR_REFLECTION
-#endif
-
-#if PD
-#define PINCR DINCR
-#else
-#define PINCR(z,c) (z.c++)
-#endif
-
-#define P_AREA(a) (a - PD)
-
-// patch shapes
-#if P == 0 || P == 1
-#define FOR_PATCH(p) S_1X1(p)
-const int p_area = P_AREA(1);
-#elif PS == 7
-#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_PLUS_A(hp,P));
-#elif PS == 6
-#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P);
-#elif PS == 5
-#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,hp));
-#elif PS == 4
-#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,P));
-#elif PS == 3
-#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_DIAMOND_A(hp,P));
-#elif PS == 2
-#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P);
-#elif PS == 1
-#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x))
-const int p_area = P_AREA(P);
-#elif PS == 0
-#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P);
-#endif
-
-const float r_scale = 1.0/r_area;
-const float p_scale = 1.0/p_area;
-
-#define load_(off)  HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
-
-#if RF && defined(LUMA_raw)
-#define load2_(off) RF_LUMA_tex(RF_LUMA_pos + RF_LUMA_pt * vec2(off))
-#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
-#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
-#elif RF && D1W
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
-#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
-#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
-#elif RF
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
-#else
-#define load2_(off) HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
-#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
-#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
-#endif
-
-#if T
-vec4 load(vec3 off)
-{
-	switch (int(off.z)) {
-	case 0: return load_(off);
-	}
-}
-vec4 load2(vec3 off)
-{
-	switch (int(off.z)) {
-	case 0: return load2_(off);
-	}
-}
-#else
-#define load(off) load_(off)
-#define load2(off) load2_(off)
-#endif
-
-vec4 poi = load(vec3(0)); // pixel-of-interest
-vec4 poi2 = load2(vec3(0)); // guide pixel-of-interest
-
-#if RI // rotation
-vec2 rot(vec2 p, float d)
-{
-	return vec2(
-		p.x * cos(radians(d)) - p.y * sin(radians(d)),
-		p.y * sin(radians(d)) + p.x * cos(radians(d))
-	);
-}
-#else
-#define rot(p, d) (p)
-#endif
-
-#if RFI // reflection
-vec2 ref(vec2 p, int d)
-{
-	switch (d) {
-	case 0: return p;
-	case 1: return p * vec2(1, -1);
-	case 2: return p * vec2(-1, 1);
-	}
-}
-#else
-#define ref(p, d) (p)
-#endif
-
-vec4 patch_comparison(vec3 r, vec3 r2)
-{
-	vec3 p;
-	vec4 min_rot = vec4(p_area);
-
-	FOR_ROTATION FOR_REFLECTION {
-		vec4 pdiff_sq = vec4(0);
-		FOR_PATCH(p) {
-			vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z);
-			vec4 diff_sq = pow(load2(p + r2) - load2((transformed_p + r) * SF), vec4(2));
-#if PST && P >= PST
-			float pdist = exp(-pow(length(p.xy*PSD)*PSS, 2));
-			diff_sq = pow(max(diff_sq, EPSILON), vec4(pdist));
-#endif
-			pdiff_sq += diff_sq;
-		}
-		min_rot = min(min_rot, pdiff_sq);
-	}
-
-	return min_rot * p_scale;
-}
-
-#define NO_GATHER (PD == 0) // never textureGather if any of these conditions are false
-#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
-
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
-// 3x3 diamond/plus patch_comparison_gather
-const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
-const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
-vec4 poi_patch = gather_offs(0, offsets);
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
-{
-	float min_rot = p_area - 1;
-	vec4 transformer = gather_offs(r, offsets_sf);
-	FOR_ROTATION {
-		FOR_REFLECTION {
-			float diff_sq = dot(pow(poi_patch - transformer, vec4(2)), vec4(1));
-			min_rot = min(diff_sq, min_rot);
-#if RFI
-			switch(rfi) {
-			case 0: transformer = transformer.zyxw; break;
-			case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror
-			case 2: transformer = transformer.zyxw; break; // undoes last mirror
-			}
-#endif
-		}
-#if RI == 3
-		transformer = transformer.wxyz;
-#elif RI == 1
-		transformer = transformer.zwxy;
-#endif
-	}
-	return vec4(min_rot + pow(poi2.x - load2(r).x, 2), 0, 0, 0) * p_scale;
-}
-#elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
-// tiled even square patch_comparison_gather
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
-{
-	vec2 tile;
-	float min_rot = p_area;
-
-	/* gather order:
-	 * w z
-	 * x y
-	 */
-	FOR_ROTATION FOR_REFLECTION {
-		float pdiff_sq = 0;
-		for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
-			vec4 poi_patch = gather(tile + r2.xy);
-			vec4 transformer = gather(ref(rot(tile + 0.5, ri), rfi) - 0.5 + r.xy);
-
-#if RI
-			for (float i = 0; i < ri; i+=90)
-				transformer = transformer.wxyz; // rotate 90 degrees
-#endif
-#if RFI // XXX output is a little off
-			switch(rfi) {
-			case 1: transformer = transformer.zyxw; break;
-			case 2: transformer = transformer.xwzy; break;
-			}
-#endif
-
-			vec4 diff_sq = pow(poi_patch - transformer, vec4(2));
-#if PST && P >= PST
-			vec4 pdist = vec4(
-				exp(-pow(length((tile+vec2(0,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,0))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(0,0))*PSD)*PSS, 2))
-			);
-			diff_sq = pow(max(diff_sq, EPSILON), pdist);
-#endif
-			pdiff_sq += dot(diff_sq, vec4(1));
-		}
-		min_rot = min(min_rot, pdiff_sq);
-	}
-
-	return vec4(min_rot, 0, 0, 0) * p_scale;
-}
-#else
-#define patch_comparison_gather patch_comparison
-#endif
-
-vec4 hook()
-{
-	vec4 total_weight = vec4(0);
-	vec4 sum = vec4(0);
-	vec4 result = vec4(0);
-
-	vec3 r = vec3(0);
-	vec3 p = vec3(0);
-	vec3 me = vec3(0);
-
-#if T && ME == 1 // temporal & motion estimation
-	vec3 me_tmp = vec3(0);
-	float maxweight = 0;
-#elif T && ME == 2 // temporal & motion estimation
-	vec3 me_sum = vec3(0);
-	float me_weight = 0;
-#endif
-
-#if WD == 2 || M == 3 // weight discard, weighted median intensities
-	int r_index = 0;
-	vec4 all_weights[r_area];
-	vec4 all_pixels[r_area];
-#elif WD == 1 // weight discard
-	vec4 no_weights = vec4(0);
-	vec4 discard_total_weight = vec4(0);
-	vec4 discard_sum = vec4(0);
-#endif
-
-#if M == 1 // Euclidean medians
-	vec4 minsum = vec4(0);
-#endif
-
-	FOR_FRAME(r) {
-#if T && ME == 1 // temporal & motion estimation max weight
-	if (r.z > 0) {
-		me += me_tmp;
-		me_tmp = vec3(0);
-		maxweight = 0;
-	}
-#elif T && ME == 2 // temporal & motion estimation weighted average
-	if (r.z > 0) {
-		me += round(me_sum / me_weight);
-		me_sum = vec3(0);
-		me_weight = 0;
-	}
-#endif
-	FOR_RESEARCH(r) {
-		// main NLM logic
-		const float h = S*0.013;
-		const float pdiff_scale = 1.0/(h*h);
-		vec4 pdiff_sq = (r.z == 0) ? patch_comparison_gather(r+me, vec3(0)) : patch_comparison(r+me, vec3(0));
-		vec4 weight = exp(-pdiff_sq * pdiff_scale);
-
-#if T && ME == 1 // temporal & motion estimation max weight
-		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
-		maxweight = max(maxweight, weight.x);
-#elif T && ME == 2 // temporal & motion estimation weighted average
-		me_sum += vec3(r.xy,0) * weight.x;
-		me_weight += weight.x;
-#endif
-
-#if D1W
-		weight = vec4(weight.x);
-#endif
-
-		weight *= exp(-pow(length(r*SD)*SS, 2)); // spatial kernel
-
-#if WD == 2 || M == 3 // weight discard, weighted median intensity
-		all_weights[r_index] = weight;
-		all_pixels[r_index] = load(r+me);
-		r_index++;
-#elif WD == 1 // weight discard
-		vec4 wd_scale = 1.0/max(no_weights, 1);
-		vec4 keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
-		discard_sum += load(r+me) * weight * (1 - keeps);
-		discard_total_weight += weight * (1 - keeps);
-		no_weights += keeps;
-#endif
-
-		sum += load(r+me) * weight;
-		total_weight += weight;
-
-#if M == 1 // Euclidean median
-		// Based on: https://arxiv.org/abs/1207.3056
-		// XXX might not work with ME
-		vec3 r2;
-		vec4 wpdist_sum = vec4(0);
-		FOR_FRAME(r2) FOR_RESEARCH(r2) {
-			vec4 pdist = (r.z + r2.z) == 0 ? patch_comparison_gather(r+me, r2+me) : patch_comparison(r+me, r2+me);
-			wpdist_sum += sqrt(pdist) * (1-weight);
-		}
-
-		vec4 newmin = step(wpdist_sum, minsum); // wpdist_sum <= minsum
-		newmin *= 1 - step(wpdist_sum, vec4(0)); // && wpdist_sum > 0
-		newmin += step(minsum, vec4(0)); // || minsum <= 0
-		newmin = min(newmin, 1);
-
-		minsum = (newmin * wpdist_sum) + ((1-newmin) * minsum);
-		result = (newmin * load(r+me)) + ((1-newmin) * result);
-#endif
-	} // FOR_RESEARCH
-	} // FOR_FRAME
-
-#if T // temporal
-#endif
-
-	vec4 avg_weight = total_weight * r_scale;
-	vec4 old_avg_weight = avg_weight;
-
-#if WD == 2 // true average
-	total_weight = vec4(0);
-	sum = vec4(0);
-	vec4 no_weights = vec4(0);
-
-	for (int i = 0; i < r_area; i++) {
-		vec4 keeps = step(avg_weight*WDT, all_weights[i]);
-		all_weights[i] *= keeps;
-		sum += all_pixels[i] * all_weights[i];
-		total_weight += all_weights[i];
-		no_weights += keeps;
-	}
-#elif WD == 1 // moving cumulative average
-	total_weight -= discard_total_weight;
-	sum -= discard_sum;
-#endif
-#if WD // weight discard
-	avg_weight = total_weight / no_weights;
-#endif
-
-	total_weight += SW;
-	sum += poi * SW;
-
-#if M == 3 // weighted median intensity
-	const float hr_area = r_area/2.0;
-	vec4 is_median, gt, lt, gte, lte, neq;
-
-	for (int i = 0; i < r_area; i++) {
-		gt = lt = vec4(0);
-		for (int j = 0; j < r_area; j++) {
-			gte = step(all_pixels[i]*all_weights[i], all_pixels[j]*all_weights[j]);
-			lte = step(all_pixels[j]*all_weights[j], all_pixels[i]*all_weights[i]);
-			neq = 1 - gte * lte;
-			gt += gte * neq;
-			lt += lte * neq;
-		}
-		is_median = step(gt, vec4(hr_area)) * step(lt, vec4(hr_area));
-		result += step(result, vec4(0)) * is_median * all_pixels[i];
-	}
-#elif M == 2 // weight map
-	result = avg_weight;
-#elif M == 0 // mean
-	result = sum / total_weight;
-#endif
-
-#if AS == 1 // sharpen+denoise
-	vec4 sharpened = result + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
-#elif AS == 2 // sharpen only
-	vec4 sharpened = poi + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
-#endif
-
-#if EP // extremes preserve
-	float luminance = EP_texOff(0).x;
-	// EPSILON is needed since pow(0,0) is undefined
-	float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP));
-	result = mix(poi, result, ep_weight);
-#endif
-
-#if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_power);
-#elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_power);
-#endif
-
-	return mix(poi, result, BF);
-}
-
diff --git a/portable_config/shaders/nlmeans_heavy.glsl b/portable_config/shaders/nlmeans_heavy.glsl
deleted file mode 100644
index aba1573c..00000000
--- a/portable_config/shaders/nlmeans_heavy.glsl
+++ /dev/null
@@ -1,1044 +0,0 @@
-/* vi: ft=c
- *
- * Based on vf_nlmeans.c from FFmpeg.
- *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
- * Copyright (c) 2016 Clément Bœsch <u pkh me>
- *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-// Profile description: Tuned for heavy noise.
-
-/* The recommended usage of this shader and its variant profiles is to add them 
- * to input.conf and then dispatch the appropriate shader via a keybind during 
- * media playback. Here is an example input.conf entry:
- *
- * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)"
- *
- * These shaders can also be enabled by default in mpv.conf, for example:
- *
- * glsl-shaders='~~/shaders/nlmeans.glsl'
- *
- * Both of the examples above assume the shaders are located in a subdirectory 
- * named "shaders" within mpv's config directory. Refer to the mpv 
- * documentation for more details.
- *
- * This shader is highly configurable via user variables below. Although the 
- * default settings should offer good quality at a reasonable speed, you are 
- * encouraged to tweak them to your preferences. Be mindful that certain 
- * settings may greatly affect speed.
- *
- * Denoising is most useful for noisy content. If there is no perceptible 
- * noise, you probably won't see a positive difference.
- *
- * The default settings are generally tuned for low noise and high detail 
- * preservation. The "medium" and "heavy" profiles are tuned for higher levels 
- * of noise.
- *
- * The denoiser will not work properly if the content has been upscaled 
- * beforehand, whether it was done by you or someone down the line. Consider 
- * issuing a command to downscale in the mpv console, like so:
- *
- * vf toggle scale=-2:720
- *
- * ...replacing 720 with whatever resolution seems appropriate. Rerun the 
- * command to undo the downscale. It may take some trial-and-error to find the 
- * proper resolution.
- */
-
-/* Regarding speed
- *
- * Speed may vary wildly for different vo and gpu-api settings. Generally 
- * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this 
- * may be different for your system.
- *
- * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ and VLQ 
- * profiles.
- *
- * textureGather is LUMA only and limited to the following configurations:
- *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}:M!=1
- *   - Default, very fast, rotations and reflections should be free
- *   - If this is unusually slow then try changing gpu-api and vo
- *   - If it's still slow, try setting RI/RFI to 0.
- *
- * - PS=6:RI={0,1,3}:RFI={0,1,2}
- *   - Currently the only scalable variant
- *   - Patch shape is asymmetric on two axis
- *   - Rotations should have very little speed impact
- *   - Reflections may have a significant speed impact
- *
- * Options which always disable textureGather:
- * 	- RF
- * 	- PD
- */
-
-// The following is shader code injected from guided_s.glsl
-/* vi: ft=c
- *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
- *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY;  without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-//desc: "Self-guided" guided filter
-
-/* The radius can be adjusted with the MEANIP stage's downscaling factor. 
- * Higher numbers give a bigger radius.
- *
- * The E variable can be found in the A stage.
- *
- * The subsampling (fast guided filter) can be adjusted with the IP stage's 
- * downscaling factor. Higher numbers are faster.
- */
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (IP)
-//!BIND HOOKED
-//!WIDTH HOOKED.w 1.0 /
-//!HEIGHT HOOKED.h 1.0 /
-//!SAVE _INJ_IP
-
-vec4 hook()
-{
-	 return HOOKED_texOff(0); 
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (MEANIP)
-//!BIND _INJ_IP
-//!WIDTH _INJ_IP.w 2.0 /
-//!HEIGHT _INJ_IP.h 2.0 /
-//!SAVE _INJ_MEANIP
-
-vec4 hook()
-{
-return _INJ_IP_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (_INJ_IP_SQ)
-//!BIND _INJ_IP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
-//!SAVE _INJ_IP_SQ
-
-vec4 hook()
-{
-return _INJ_IP_texOff(0) * _INJ_IP_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (CORRIP)
-//!BIND _INJ_IP_SQ
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
-//!SAVE _INJ_CORRIP
-
-vec4 hook()
-{
-return _INJ_IP_SQ_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (A)
-//!BIND _INJ_MEANIP
-//!BIND _INJ_CORRIP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
-//!SAVE _INJ_A
-
-#define E 0.001
-
-vec4 hook()
-{
-vec4 var = _INJ_CORRIP_texOff(0) - _INJ_MEANIP_texOff(0) * _INJ_MEANIP_texOff(0);
-	 vec4 cov = var; 
-	 return cov / (var + E); 
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (B)
-//!BIND _INJ_A
-//!BIND _INJ_MEANIP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
-//!SAVE _INJ_B
-
-vec4 hook()
-{
-return _INJ_MEANIP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANIP_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (MEANA)
-//!BIND _INJ_A
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
-//!SAVE _INJ_MEANA
-
-vec4 hook()
-{
-return _INJ_A_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (MEANB)
-//!BIND _INJ_B
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
-//!SAVE _INJ_MEANB
-
-vec4 hook()
-{
-return _INJ_B_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter
-//!BIND HOOKED
-//!BIND _INJ_MEANA
-//!BIND _INJ_MEANB
-//!SAVE RF_LUMA
-
-vec4 hook()
-{
-return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
-}
-
-// End of source code injected from guided_s.glsl
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!DESC Non-local means (share)
-//!BIND RF_LUMA
-//!SAVE RF
-
-vec4 hook()
-{
-	return RF_LUMA_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!HOOK RGB
-//!BIND HOOKED
-//!BIND RF_LUMA
-//!BIND RF
-//!DESC Non-local means (nlmeans_heavy.glsl)
-
-/* User variables
- *
- * It is usually preferable to denoise chroma and luma differently, so the user 
- * variables for luma and chroma are split.
- */
-
-/* S = denoising factor
- * P = patch size
- * R = research size
- *
- * The denoising factor controls the level of blur, higher is blurrier.
- *
- * Patch size should usually be an odd number greater than or equal to 3. 
- * Higher values are slower and not always better.
- *
- * Research size usually be an odd number greater than or equal to 3. Higher 
- * values are usually better, but slower and offer diminishing returns.
- *
- * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
- * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so enable RF when using even 
- * patch/research sizes.
- */
-#ifdef LUMA_raw
-#define S 2.25
-#define P 3
-#define R 5
-#else
-#define S 1.50
-#define P 3
-#define R 5
-#endif
-
-/* Adaptive sharpening
- *
- * Uses the blur incurred by denoising plus the weight map to perform an 
- * unsharp mask that gets applied most strongly to edges.
- *
- * Sharpening will amplify noise, so the denoising factor (S) should usually be 
- * increased to compensate.
- *
- * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
- * ASF: Sharpening factor, higher numbers make a sharper underlying image
- * ASP: Weight power, higher numbers use more of the sharp image
- */
-#ifdef LUMA_raw
-#define AS 0
-#define ASF 1.0
-#define ASP 2.0
-#else
-#define AS 0
-#define ASF 1.0
-#define ASP 2.0
-#endif
-
-/* Starting weight
- *
- * Lower numbers give less weight to the pixel-of-interest, which may help 
- * handle higher noise levels, ringing, and may be useful for other things too?
- *
- * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight variable may be used to make SW adapt to the local noise level, 
- * e.g., SW=max(avg_weight, EPSILON)
- */
-#ifdef LUMA_raw
-#define SW EPSILON
-#else
-#define SW EPSILON
-#endif
-
-/* Weight discard
- *
- * Discard weights that fall below a fraction of the average weight. This culls 
- * the most dissimilar samples from the blur, yielding a much more pleasant 
- * result, especially around edges.
- * 
- * WD:
- * 	- 2: True average. Very good quality, but slower and uses more memory.
- * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
- * 	- 0: Disable
- *
- * WDT: Threshold coefficient, higher numbers discard more
- * WDP (WD=1): Higher numbers reduce the threshold more for small sample sizes
- */
-#ifdef LUMA_raw
-#define WD 2
-#define WDT 1.0
-#define WDP 6.0
-#else
-#define WD 2
-#define WDT 1.0
-#define WDP 6.0
-#endif
-
-/* Search shape
- *
- * Determines the shape of patches and research zones. Different shapes have 
- * different speed and quality characteristics. Every shape (besides square) is 
- * smaller than square.
- *
- * PS applies applies to patches, RS applies to research zones.
- *
- * 0: square (symmetrical)
- * 1: horizontal line (asymmetric)
- * 2: vertical line (asymmetric)
- * 3: diamond (symmetrical)
- * 4: triangle (asymmetric, pointing upward)
- * 5: truncated triangle (asymmetric on two axis, last row halved)
- * 6: even sized square (asymmetric on two axis)
- * 7: plus (symmetrical)
- */
-#ifdef LUMA_raw
-#define RS 3
-#define PS 3
-#else
-#define RS 3
-#define PS 3
-#endif
-
-/* Rotational/reflectional invariance
- *
- * Number of rotations/reflections to try for each patch comparison. Slow, but 
- * improves feature preservation, although adding more rotations/reflections 
- * gives diminishing returns. The most similar rotation/reflection will be used.
- *
- * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
- * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
- *
- * RI: Rotational invariance
- * RFI (0 to 2): Reflectional invariance
- */
-#ifdef LUMA_raw
-#define RI 3
-#define RFI 2
-#else
-#define RI 0
-#define RFI 0
-#endif
-
-/* Temporal denoising
- *
- * Caveats:
- * 	- Slower, each frame needs to be researched
- * 	- Requires vo=gpu-next and nlmeans_temporal.glsl
- * 	- Luma-only (this is a bug)
- * 	- Buggy
- *
- * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (compression 
- * noise, repeating frames), but can work very well on high quality video.
- *
- * Motion estimation (ME) should improve quality without impacting speed.
- *
- * T: number of frames used
- * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
- */
-#ifdef LUMA_raw
-#define T 0
-#define ME 1
-#else
-#define T 0
-#define ME 0
-#endif
-
-/* Spatial kernel
- *
- * Increasing the spatial denoising factor (SS) reduces the weight of further 
- * pixels.
- *
- * Spatial distortion instructs the spatial kernel to view that axis as 
- * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
- * appear closer and increase blur between frames.
- *
- * The intra-patch variants do not yet have well-understood effects. They are 
- * intended to make large patch sizes more useful. Likely slower.
- *
- * SS: spatial denoising factor
- * SD: spatial distortion (X, Y, time)
- * PSS: intra-patch spatial denoising factor
- * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
- * PSD: intra-patch spatial distortion (X, Y)
- */
-#ifdef LUMA_raw
-#define SS 0.25
-#define SD vec3(1,1,1)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#else
-#define SS 0.25
-#define SD vec3(1,1,1)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#endif
-
-/* Extremes preserve
- *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
- *
- * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
- */
-#ifdef LUMA_raw
-#define EP 0
-#define BP 0.75
-#define DP 0.25
-#else
-#define EP 0
-#define BP 0.0
-#define DP 0.0
-#endif
-
-/* Robust filtering
- *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
- *
- * Compares the pixel-of-interest against downscaled pixels.
- *
- * This will virtually always improve quality, but will always disable 
- * textureGather optimizations.
- *
- * The downscale factor can be modified in the WIDTH/HEIGHT directives for the 
- * RF texture (for CHROMA, RGB) and RF_LUMA (LUMA only) textures near the top 
- * of this shader, higher numbers increase blur.
- *
- * Any notation of RF as a positive number should be assumed to be referring to 
- * the downscaling factor, e.g., RF=3 means RF is set to 1 and the downscaling 
- * factor is set to 3.
- */
-#ifdef LUMA_raw
-#define RF 1
-#else
-#define RF 1
-#endif
-
-/* Blur factor
- *
- * 0 to 1, only useful for alternative estimators. You're probably looking for 
- * "S" (denoising factor), go back to the top of the shader!
- */
-#ifdef LUMA_raw
-#define BF 1.0
-#else
-#define BF 1.0
-#endif
-
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-
-// Scaling factor (should match WIDTH/HEIGHT)
-#ifdef LUMA_raw
-#define SF 1
-#else
-#define SF 1
-#endif
-
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- */
-#ifdef LUMA_raw
-#define M 0
-#else
-#define M 0
-#endif
-
-// Patch donut (probably useless)
-#ifdef LUMA_raw
-#define PD 0
-#else
-#define PD 0
-#endif
-
-// Duplicate 1st weight
-#ifdef LUMA_raw
-#define D1W 0
-#else
-#define D1W 0
-#endif
-
-/* Shader code */
-
-#define EPSILON 0.00000000001
-
-#if PS == 6
-const int hp = P/2;
-#else
-const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes
-#endif
-
-#if RS == 6
-const int hr = R/2;
-#else
-const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes
-#endif
-
-// donut increment, increments without landing on (0,0,0)
-// much faster than a "continue" statement
-#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
-
-// search shapes and their corresponding areas
-#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
-
-#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr)
-#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr)
-#define S_TRIANGLE_A(hz,Z) int(pow(hz, 2)+Z)
-
-#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr)
-#define S_DIAMOND_A(hz,Z) int(pow(hz, 2)*2+Z)
-
-#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
-#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
-
-#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
-#define S_PLUS_A(hz,Z) (Z*2 - 1)
-
-#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
-#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
-
-#define T1 (T+1)
-#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++)
-
-// Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF
-#define RINCR(z,c) (z.c++)
-#else
-#define RINCR DINCR
-#endif
-
-#define R_AREA(a) (a * T1 + RF-1)
-
-// research shapes
-#if R == 0 || R == 1
-#define FOR_RESEARCH(r) S_1X1(r)
-const int r_area = R_AREA(1);
-#elif RS == 7
-#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_PLUS_A(hr,R));
-#elif RS == 6
-#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R);
-#elif RS == 5
-#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,hr));
-#elif RS == 4
-#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,R));
-#elif RS == 3
-#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_DIAMOND_A(hr,R));
-#elif RS == 2
-#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R);
-#elif RS == 1
-#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x))
-const int r_area = R_AREA(R);
-#elif RS == 0
-#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R);
-#endif
-
-#define RI1 (RI+1)
-#define RFI1 (RFI+1)
-
-#if RI
-#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1)
-#else
-#define FOR_ROTATION
-#endif
-
-#if RFI
-#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++)
-#else
-#define FOR_REFLECTION
-#endif
-
-#if PD
-#define PINCR DINCR
-#else
-#define PINCR(z,c) (z.c++)
-#endif
-
-#define P_AREA(a) (a - PD)
-
-// patch shapes
-#if P == 0 || P == 1
-#define FOR_PATCH(p) S_1X1(p)
-const int p_area = P_AREA(1);
-#elif PS == 7
-#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_PLUS_A(hp,P));
-#elif PS == 6
-#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P);
-#elif PS == 5
-#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,hp));
-#elif PS == 4
-#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,P));
-#elif PS == 3
-#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_DIAMOND_A(hp,P));
-#elif PS == 2
-#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P);
-#elif PS == 1
-#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x))
-const int p_area = P_AREA(P);
-#elif PS == 0
-#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P);
-#endif
-
-const float r_scale = 1.0/r_area;
-const float p_scale = 1.0/p_area;
-
-#define load_(off)  HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
-
-#if RF && defined(LUMA_raw)
-#define load2_(off) RF_LUMA_tex(RF_LUMA_pos + RF_LUMA_pt * vec2(off))
-#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
-#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
-#elif RF && D1W
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
-#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
-#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
-#elif RF
-#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
-#else
-#define load2_(off) HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
-#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
-#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
-#endif
-
-#if T
-vec4 load(vec3 off)
-{
-	switch (int(off.z)) {
-	case 0: return load_(off);
-	}
-}
-vec4 load2(vec3 off)
-{
-	switch (int(off.z)) {
-	case 0: return load2_(off);
-	}
-}
-#else
-#define load(off) load_(off)
-#define load2(off) load2_(off)
-#endif
-
-vec4 poi = load(vec3(0)); // pixel-of-interest
-vec4 poi2 = load2(vec3(0)); // guide pixel-of-interest
-
-#if RI // rotation
-vec2 rot(vec2 p, float d)
-{
-	return vec2(
-		p.x * cos(radians(d)) - p.y * sin(radians(d)),
-		p.y * sin(radians(d)) + p.x * cos(radians(d))
-	);
-}
-#else
-#define rot(p, d) (p)
-#endif
-
-#if RFI // reflection
-vec2 ref(vec2 p, int d)
-{
-	switch (d) {
-	case 0: return p;
-	case 1: return p * vec2(1, -1);
-	case 2: return p * vec2(-1, 1);
-	}
-}
-#else
-#define ref(p, d) (p)
-#endif
-
-vec4 patch_comparison(vec3 r, vec3 r2)
-{
-	vec3 p;
-	vec4 min_rot = vec4(p_area);
-
-	FOR_ROTATION FOR_REFLECTION {
-		vec4 pdiff_sq = vec4(0);
-		FOR_PATCH(p) {
-			vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z);
-			vec4 diff_sq = pow(load2(p + r2) - load2((transformed_p + r) * SF), vec4(2));
-#if PST && P >= PST
-			float pdist = exp(-pow(length(p.xy*PSD)*PSS, 2));
-			diff_sq = pow(max(diff_sq, EPSILON), vec4(pdist));
-#endif
-			pdiff_sq += diff_sq;
-		}
-		min_rot = min(min_rot, pdiff_sq);
-	}
-
-	return min_rot * p_scale;
-}
-
-#define NO_GATHER (PD == 0) // never textureGather if any of these conditions are false
-#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
-
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
-// 3x3 diamond/plus patch_comparison_gather
-const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
-const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
-vec4 poi_patch = gather_offs(0, offsets);
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
-{
-	float min_rot = p_area - 1;
-	vec4 transformer = gather_offs(r, offsets_sf);
-	FOR_ROTATION {
-		FOR_REFLECTION {
-			float diff_sq = dot(pow(poi_patch - transformer, vec4(2)), vec4(1));
-			min_rot = min(diff_sq, min_rot);
-#if RFI
-			switch(rfi) {
-			case 0: transformer = transformer.zyxw; break;
-			case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror
-			case 2: transformer = transformer.zyxw; break; // undoes last mirror
-			}
-#endif
-		}
-#if RI == 3
-		transformer = transformer.wxyz;
-#elif RI == 1
-		transformer = transformer.zwxy;
-#endif
-	}
-	return vec4(min_rot + pow(poi2.x - load2(r).x, 2), 0, 0, 0) * p_scale;
-}
-#elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
-// tiled even square patch_comparison_gather
-vec4 patch_comparison_gather(vec3 r, vec3 r2)
-{
-	vec2 tile;
-	float min_rot = p_area;
-
-	/* gather order:
-	 * w z
-	 * x y
-	 */
-	FOR_ROTATION FOR_REFLECTION {
-		float pdiff_sq = 0;
-		for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
-			vec4 poi_patch = gather(tile + r2.xy);
-			vec4 transformer = gather(ref(rot(tile + 0.5, ri), rfi) - 0.5 + r.xy);
-
-#if RI
-			for (float i = 0; i < ri; i+=90)
-				transformer = transformer.wxyz; // rotate 90 degrees
-#endif
-#if RFI // XXX output is a little off
-			switch(rfi) {
-			case 1: transformer = transformer.zyxw; break;
-			case 2: transformer = transformer.xwzy; break;
-			}
-#endif
-
-			vec4 diff_sq = pow(poi_patch - transformer, vec4(2));
-#if PST && P >= PST
-			vec4 pdist = vec4(
-				exp(-pow(length((tile+vec2(0,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,1))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(1,0))*PSD)*PSS, 2)),
-				exp(-pow(length((tile+vec2(0,0))*PSD)*PSS, 2))
-			);
-			diff_sq = pow(max(diff_sq, EPSILON), pdist);
-#endif
-			pdiff_sq += dot(diff_sq, vec4(1));
-		}
-		min_rot = min(min_rot, pdiff_sq);
-	}
-
-	return vec4(min_rot, 0, 0, 0) * p_scale;
-}
-#else
-#define patch_comparison_gather patch_comparison
-#endif
-
-vec4 hook()
-{
-	vec4 total_weight = vec4(0);
-	vec4 sum = vec4(0);
-	vec4 result = vec4(0);
-
-	vec3 r = vec3(0);
-	vec3 p = vec3(0);
-	vec3 me = vec3(0);
-
-#if T && ME == 1 // temporal & motion estimation
-	vec3 me_tmp = vec3(0);
-	float maxweight = 0;
-#elif T && ME == 2 // temporal & motion estimation
-	vec3 me_sum = vec3(0);
-	float me_weight = 0;
-#endif
-
-#if WD == 2 || M == 3 // weight discard, weighted median intensities
-	int r_index = 0;
-	vec4 all_weights[r_area];
-	vec4 all_pixels[r_area];
-#elif WD == 1 // weight discard
-	vec4 no_weights = vec4(0);
-	vec4 discard_total_weight = vec4(0);
-	vec4 discard_sum = vec4(0);
-#endif
-
-#if M == 1 // Euclidean medians
-	vec4 minsum = vec4(0);
-#endif
-
-	FOR_FRAME(r) {
-#if T && ME == 1 // temporal & motion estimation max weight
-	if (r.z > 0) {
-		me += me_tmp;
-		me_tmp = vec3(0);
-		maxweight = 0;
-	}
-#elif T && ME == 2 // temporal & motion estimation weighted average
-	if (r.z > 0) {
-		me += round(me_sum / me_weight);
-		me_sum = vec3(0);
-		me_weight = 0;
-	}
-#endif
-	FOR_RESEARCH(r) {
-		// main NLM logic
-		const float h = S*0.013;
-		const float pdiff_scale = 1.0/(h*h);
-		vec4 pdiff_sq = (r.z == 0) ? patch_comparison_gather(r+me, vec3(0)) : patch_comparison(r+me, vec3(0));
-		vec4 weight = exp(-pdiff_sq * pdiff_scale);
-
-#if T && ME == 1 // temporal & motion estimation max weight
-		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
-		maxweight = max(maxweight, weight.x);
-#elif T && ME == 2 // temporal & motion estimation weighted average
-		me_sum += vec3(r.xy,0) * weight.x;
-		me_weight += weight.x;
-#endif
-
-#if D1W
-		weight = vec4(weight.x);
-#endif
-
-		weight *= exp(-pow(length(r*SD)*SS, 2)); // spatial kernel
-
-#if WD == 2 || M == 3 // weight discard, weighted median intensity
-		all_weights[r_index] = weight;
-		all_pixels[r_index] = load(r+me);
-		r_index++;
-#elif WD == 1 // weight discard
-		vec4 wd_scale = 1.0/max(no_weights, 1);
-		vec4 keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
-		discard_sum += load(r+me) * weight * (1 - keeps);
-		discard_total_weight += weight * (1 - keeps);
-		no_weights += keeps;
-#endif
-
-		sum += load(r+me) * weight;
-		total_weight += weight;
-
-#if M == 1 // Euclidean median
-		// Based on: https://arxiv.org/abs/1207.3056
-		// XXX might not work with ME
-		vec3 r2;
-		vec4 wpdist_sum = vec4(0);
-		FOR_FRAME(r2) FOR_RESEARCH(r2) {
-			vec4 pdist = (r.z + r2.z) == 0 ? patch_comparison_gather(r+me, r2+me) : patch_comparison(r+me, r2+me);
-			wpdist_sum += sqrt(pdist) * (1-weight);
-		}
-
-		vec4 newmin = step(wpdist_sum, minsum); // wpdist_sum <= minsum
-		newmin *= 1 - step(wpdist_sum, vec4(0)); // && wpdist_sum > 0
-		newmin += step(minsum, vec4(0)); // || minsum <= 0
-		newmin = min(newmin, 1);
-
-		minsum = (newmin * wpdist_sum) + ((1-newmin) * minsum);
-		result = (newmin * load(r+me)) + ((1-newmin) * result);
-#endif
-	} // FOR_RESEARCH
-	} // FOR_FRAME
-
-#if T // temporal
-#endif
-
-	vec4 avg_weight = total_weight * r_scale;
-	vec4 old_avg_weight = avg_weight;
-
-#if WD == 2 // true average
-	total_weight = vec4(0);
-	sum = vec4(0);
-	vec4 no_weights = vec4(0);
-
-	for (int i = 0; i < r_area; i++) {
-		vec4 keeps = step(avg_weight*WDT, all_weights[i]);
-		all_weights[i] *= keeps;
-		sum += all_pixels[i] * all_weights[i];
-		total_weight += all_weights[i];
-		no_weights += keeps;
-	}
-#elif WD == 1 // moving cumulative average
-	total_weight -= discard_total_weight;
-	sum -= discard_sum;
-#endif
-#if WD // weight discard
-	avg_weight = total_weight / no_weights;
-#endif
-
-	total_weight += SW;
-	sum += poi * SW;
-
-#if M == 3 // weighted median intensity
-	const float hr_area = r_area/2.0;
-	vec4 is_median, gt, lt, gte, lte, neq;
-
-	for (int i = 0; i < r_area; i++) {
-		gt = lt = vec4(0);
-		for (int j = 0; j < r_area; j++) {
-			gte = step(all_pixels[i]*all_weights[i], all_pixels[j]*all_weights[j]);
-			lte = step(all_pixels[j]*all_weights[j], all_pixels[i]*all_weights[i]);
-			neq = 1 - gte * lte;
-			gt += gte * neq;
-			lt += lte * neq;
-		}
-		is_median = step(gt, vec4(hr_area)) * step(lt, vec4(hr_area));
-		result += step(result, vec4(0)) * is_median * all_pixels[i];
-	}
-#elif M == 2 // weight map
-	result = avg_weight;
-#elif M == 0 // mean
-	result = sum / total_weight;
-#endif
-
-#if AS == 1 // sharpen+denoise
-	vec4 sharpened = result + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
-#elif AS == 2 // sharpen only
-	vec4 sharpened = poi + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
-#endif
-
-#if EP // extremes preserve
-	float luminance = EP_texOff(0).x;
-	// EPSILON is needed since pow(0,0) is undefined
-	float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP));
-	result = mix(poi, result, ep_weight);
-#endif
-
-#if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_power);
-#elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_power);
-#endif
-
-	return mix(poi, result, BF);
-}
-
diff --git a/portable_config/shaders/nlmeans_hq.glsl b/portable_config/shaders/nlmeans_hq.glsl
index 16f13d51..f1d6b4da 100644
--- a/portable_config/shaders/nlmeans_hq.glsl
+++ b/portable_config/shaders/nlmeans_hq.glsl
@@ -82,11 +82,10 @@
  *   - Reflections may have a significant speed impact
  *
  * Options which always disable textureGather:
- * 	- RF
  * 	- PD
  */
 
-// The following is shader code injected from guided_s.glsl
+// The following is shader code injected from guided.glsl
 /* vi: ft=c
  *
  * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
@@ -105,25 +104,53 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-//desc: "Self-guided" guided filter
+//desc: Guided filter guided by the downscaled image
 
-/* The radius can be adjusted with the MEANIP stage's downscaling factor. 
+/* The radius can be adjusted with the MEANI stage's downscaling factor. 
  * Higher numbers give a bigger radius.
  *
  * The E variable can be found in the A stage.
  *
- * The subsampling (fast guided filter) can be adjusted with the IP stage's 
+ * The subsampling (fast guided filter) can be adjusted with the I stage's 
  * downscaling factor. Higher numbers are faster.
+ *
+ * The guide's subsampling can be adjusted with the PREI stage's downscaling 
+ * factor. Higher numbers downscale more.
  */
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (IP)
+//!DESC Guided filter (PREI)
 //!BIND HOOKED
+//!WIDTH HOOKED.w 1.25 /
+//!HEIGHT HOOKED.h 1.25 /
+//!SAVE _INJ_PREI
+
+vec4 hook()
+{
+	 return HOOKED_texOff(0); 
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (I)
+//!BIND _INJ_PREI
 //!WIDTH HOOKED.w 1.0 /
 //!HEIGHT HOOKED.h 1.0 /
-//!SAVE _INJ_IP
+//!SAVE _INJ_I
+
+vec4 hook()
+{
+return _INJ_PREI_texOff(0);
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (P)
+//!BIND HOOKED
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_P
 
 vec4 hook()
 {
@@ -132,87 +159,124 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (MEANIP)
-//!BIND _INJ_IP
-//!WIDTH _INJ_IP.w 2.0 /
-//!HEIGHT _INJ_IP.h 2.0 /
-//!SAVE _INJ_MEANIP
+//!DESC Guided filter (MEANI)
+//!BIND _INJ_I
+//!WIDTH _INJ_I.w 1.5 /
+//!HEIGHT _INJ_I.h 1.5 /
+//!SAVE _INJ_MEANI
 
 vec4 hook()
 {
-return _INJ_IP_texOff(0);
+return _INJ_I_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (_INJ_IP_SQ)
-//!BIND _INJ_IP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
-//!SAVE _INJ_IP_SQ
+//!DESC Guided filter (MEANP)
+//!BIND _INJ_P
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_MEANP
 
 vec4 hook()
 {
-return _INJ_IP_texOff(0) * _INJ_IP_texOff(0);
+return _INJ_P_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (CORRIP)
-//!BIND _INJ_IP_SQ
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
-//!SAVE _INJ_CORRIP
+//!DESC Guided filter (_INJ_I_SQ)
+//!BIND _INJ_I
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_I_SQ
 
 vec4 hook()
 {
-return _INJ_IP_SQ_texOff(0);
+return _INJ_I_texOff(0) * _INJ_I_texOff(0);
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (_INJ_IXP)
+//!BIND _INJ_I
+//!BIND _INJ_P
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_IXP
+
+vec4 hook()
+{
+return _INJ_I_texOff(0) * _INJ_P_texOff(0);
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (CORRI)
+//!BIND _INJ_I_SQ
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_CORRI
+
+vec4 hook()
+{
+return _INJ_I_SQ_texOff(0);
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (CORRP)
+//!BIND _INJ_IXP
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_CORRP
+
+vec4 hook()
+{
+return _INJ_IXP_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter (A)
-//!BIND _INJ_MEANIP
-//!BIND _INJ_CORRIP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
+//!BIND _INJ_MEANI
+//!BIND _INJ_MEANP
+//!BIND _INJ_CORRI
+//!BIND _INJ_CORRP
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
 //!SAVE _INJ_A
 
-#define E 0.001
+#define E 0.0013
 
 vec4 hook()
 {
-vec4 var = _INJ_CORRIP_texOff(0) - _INJ_MEANIP_texOff(0) * _INJ_MEANIP_texOff(0);
-	 vec4 cov = var; 
+vec4 var = _INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0);
+vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0);
 	 return cov / (var + E); 
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter (B)
 //!BIND _INJ_A
-//!BIND _INJ_MEANIP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
+//!BIND _INJ_MEANI
+//!BIND _INJ_MEANP
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
 //!SAVE _INJ_B
 
 vec4 hook()
 {
-return _INJ_MEANIP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANIP_texOff(0);
+return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter (MEANA)
 //!BIND _INJ_A
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
 //!SAVE _INJ_MEANA
 
 vec4 hook()
@@ -222,11 +286,10 @@ return _INJ_A_texOff(0);
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter (MEANB)
 //!BIND _INJ_B
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
 //!SAVE _INJ_MEANB
 
 vec4 hook()
@@ -236,7 +299,6 @@ return _INJ_B_texOff(0);
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter
 //!BIND HOOKED
 //!BIND _INJ_MEANA
@@ -248,10 +310,9 @@ vec4 hook()
 return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
 }
 
-// End of source code injected from guided_s.glsl
+// End of source code injected from guided.glsl
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Non-local means (share)
 //!BIND RF_LUMA
 //!SAVE RF
@@ -263,7 +324,6 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!BIND HOOKED
 //!BIND RF_LUMA
 //!BIND RF
@@ -289,8 +349,7 @@ vec4 hook()
  *
  * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
  * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so enable RF when using even 
- * patch/research sizes.
+ * incompatible with textureGather optimizations, so set NG=1 to disable them.
  */
 #ifdef LUMA_raw
 #define S 3
@@ -304,24 +363,37 @@ vec4 hook()
 
 /* Adaptive sharpening
  *
- * Uses the blur incurred by denoising plus the weight map to perform an 
- * unsharp mask that gets applied most strongly to edges.
+ * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
+ * weight map to restrict the sharpening to edges.
  *
- * Sharpening will amplify noise, so the denoising factor (S) should usually be 
- * increased to compensate.
+ * Use M=4 to get a good look at which areas are/aren't sharpened.
  *
  * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
  * ASF: Sharpening factor, higher numbers make a sharper underlying image
  * ASP: Weight power, higher numbers use more of the sharp image
+ * ASW:
+ * 	- 0 to use pre-WD weights
+ * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
+ * ASK: Weight kernel:
+ * 	- 0 for power. This is the old method.
+ * 	- 1 for sigmoid. This is generally recommended.
+ * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
+ * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
  */
 #ifdef LUMA_raw
 #define AS 0
-#define ASF 1.0
-#define ASP 2.0
+#define ASF 2.0
+#define ASP 32.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
 #else
 #define AS 0
-#define ASF 1.0
-#define ASP 2.0
+#define ASF 2.0
+#define ASP 32.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
 #endif
 
 /* Starting weight
@@ -330,8 +402,8 @@ vec4 hook()
  * handle higher noise levels, ringing, and may be useful for other things too?
  *
  * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight variable may be used to make SW adapt to the local noise level, 
- * e.g., SW=max(avg_weight, EPSILON)
+ * avg_weight/old_avg_weight variables may be used to make SW adapt to the 
+ * local noise level, e.g., SW=max(avg_weight, EPSILON)
  */
 #ifdef LUMA_raw
 #define SW 1.0
@@ -351,7 +423,7 @@ vec4 hook()
  * 	- 0: Disable
  *
  * WDT: Threshold coefficient, higher numbers discard more
- * WDP (WD=1): Higher numbers reduce the threshold more for small sample sizes
+ * WDP (only for WD=1): Increasing this reduces the threshold for small sample sizes
  */
 #ifdef LUMA_raw
 #define WD 2
@@ -363,6 +435,49 @@ vec4 hook()
 #define WDP 6.0
 #endif
 
+/* Extremes preserve
+ *
+ * Reduces denoising around very bright/dark areas. The downscaling factor of 
+ * EP (located near the top of this shader) controls the area sampled for 
+ * luminance (higher numbers consider more area).
+ *
+ * This is incompatible with RGB. If you have RGB hooks enabled then you will 
+ * have to delete the EP shader stage or specify EP=0 through nlmeans_cfg.
+ *
+ * EP: 1 to enable, 0 to disable
+ * DP: EP strength on dark patches, 0 to fully denoise
+ * BP: EP strength on bright patches, 0 to fully denoise
+ */
+#ifdef LUMA_raw
+#define EP 0
+#define BP 0.75
+#define DP 0.25
+#else
+#define EP 0
+#define BP 0.0
+#define DP 0.0
+#endif
+
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from nlmeans_cfg, so this 
+ * setting can only be enabled via nlmeans_cfg.
+ *
+ * Compares the pixel-of-interest against a guide, which could be a downscaled 
+ * image or the output of another shader such as guided.glsl
+ */
+#ifdef LUMA_raw
+#define RF 1
+#else
+#define RF 1
+#endif
+
 /* Search shape
  *
  * Determines the shape of patches and research zones. Different shapes have 
@@ -371,6 +486,8 @@ vec4 hook()
  *
  * PS applies applies to patches, RS applies to research zones.
  *
+ * Be wary of gather optimizations (see the Regarding Speed comment at the top)
+ *
  * 0: square (symmetrical)
  * 1: horizontal line (asymmetric)
  * 2: vertical line (asymmetric)
@@ -417,8 +534,9 @@ vec4 hook()
  * 	- Buggy
  *
  * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (compression 
- * noise, repeating frames), but can work very well on high quality video.
+ * struggle more with noise that persists across multiple frames (e.g., from 
+ * compression or duplicate frames), but can work very well on high quality 
+ * video.
  *
  * Motion estimation (ME) should improve quality without impacting speed.
  *
@@ -453,60 +571,51 @@ vec4 hook()
  */
 #ifdef LUMA_raw
 #define SS 0.25
-#define SD vec3(1,1,1)
+#define SD vec3(1,1,1.5)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #else
 #define SS 0.25
-#define SD vec3(1,1,1)
+#define SD vec3(1,1,1.5)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #endif
 
-/* Extremes preserve
- *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
+// Scaling factor (should match WIDTH/HEIGHT)
+#ifdef LUMA_raw
+#define SF 1
+#else
+#define SF 1
+#endif
+
+/* Estimator
  *
- * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
+ * 0: means
+ * 1: Euclidean medians (extremely slow, may be good for heavy noise)
+ * 2: weight map (not a denoiser, maybe useful for generating image masks)
+ * 3: weighted median intensity (slow, may be good for heavy noise)
+ * 4: edge map (based on the relevant AS settings)
  */
 #ifdef LUMA_raw
-#define EP 0
-#define BP 0.75
-#define DP 0.25
+#define M 0
 #else
-#define EP 0
-#define BP 0.0
-#define DP 0.0
+#define M 0
 #endif
 
-/* Robust filtering
- *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
+/* Difference visualization
  *
- * Compares the pixel-of-interest against downscaled pixels.
+ * Visualizes the difference between input/output image
  *
- * This will virtually always improve quality, but will always disable 
- * textureGather optimizations.
- *
- * The downscale factor can be modified in the WIDTH/HEIGHT directives for the 
- * RF texture (for CHROMA, RGB) and RF_LUMA (LUMA only) textures near the top 
- * of this shader, higher numbers increase blur.
- *
- * Any notation of RF as a positive number should be assumed to be referring to 
- * the downscaling factor, e.g., RF=3 means RF is set to 1 and the downscaling 
- * factor is set to 3.
+ * 0: off
+ * 1: absolute difference scaled by S
+ * 2: difference centered on 0.5
  */
 #ifdef LUMA_raw
-#define RF 1
+#define DV 0
 #else
-#define RF 1
+#define DV 0
 #endif
 
 /* Blur factor
@@ -520,30 +629,11 @@ vec4 hook()
 #define BF 1.0
 #endif
 
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-
-// Scaling factor (should match WIDTH/HEIGHT)
-#ifdef LUMA_raw
-#define SF 1
-#else
-#define SF 1
-#endif
-
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- */
+// Force disable textureGather
 #ifdef LUMA_raw
-#define M 0
+#define NG 0
 #else
-#define M 0
+#define NG 0
 #endif
 
 // Patch donut (probably useless)
@@ -553,7 +643,7 @@ vec4 hook()
 #define PD 0
 #endif
 
-// Duplicate 1st weight
+// Duplicate 1st weight (for LGC)
 #ifdef LUMA_raw
 #define D1W 0
 #else
@@ -563,6 +653,7 @@ vec4 hook()
 /* Shader code */
 
 #define EPSILON 0.00000000001
+#define M_PI 3.14159265358979323846
 
 #if PS == 6
 const int hp = P/2;
@@ -612,6 +703,7 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #define R_AREA(a) (a * T1 + RF-1)
 
 // research shapes
+// XXX would be nice to have the option of temporally-varying research sizes
 #if R == 0 || R == 1
 #define FOR_RESEARCH(r) S_1X1(r)
 const int r_area = R_AREA(1);
@@ -783,11 +875,13 @@ vec4 patch_comparison(vec3 r, vec3 r2)
 	return min_rot * p_scale;
 }
 
-#define NO_GATHER (PD == 0) // never textureGather if any of these conditions are false
+#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
 #define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
 
 #if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
 // 3x3 diamond/plus patch_comparison_gather
+// XXX extend to support arbitrary sizes (probably requires code generation)
+// XXX extend to support 3x3 square
 const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
 const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
 vec4 poi_patch = gather_offs(0, offsets);
@@ -817,6 +911,7 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 }
 #elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
 // tiled even square patch_comparison_gather
+// XXX extend to support odd square?
 vec4 patch_comparison_gather(vec3 r, vec3 r2)
 {
 	vec2 tile;
@@ -897,6 +992,7 @@ vec4 hook()
 #endif
 
 	FOR_FRAME(r) {
+	// XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
 #if T && ME == 1 // temporal & motion estimation max weight
 	if (r.z > 0) {
 		me += me_tmp;
@@ -967,6 +1063,7 @@ vec4 hook()
 	} // FOR_RESEARCH
 	} // FOR_FRAME
 
+	// XXX optionally put the denoised pixel into the frame buffer?
 #if T // temporal
 #endif
 
@@ -1018,12 +1115,29 @@ vec4 hook()
 	result = sum / total_weight;
 #endif
 
+#if ASW == 0 // pre-WD weights
+#define AS_weight old_avg_weight
+#elif ASW == 1 // post-WD weights
+#define AS_weight avg_weight
+#endif
+
+#if ASK == 0 // power kernel (the old method): weight raised to ASP
+	vec4 sharpening_strength = pow(AS_weight, vec4(ASP));
+#elif ASK == 1 // sigmoid kernel; the macro argument is parenthesized so compound expressions expand safely
+#define sigmoid(x) (tanh((x) * 2*M_PI - M_PI)*0.5+0.5)
+	vec4 sharpening_strength = mix(pow(sigmoid(AS_weight), vec4(ASP)),
+	                               AS_weight, ASC); // ASC blends the raw weight back in
+	// just in case ASC < 0 (will sharpen but it's janky XXX)
+	sharpening_strength = clamp(sharpening_strength, 0.0, 1.0);
+#elif ASK == 2 // constant kernel: non-adaptive, strength is ASP everywhere
+	vec4 sharpening_strength = vec4(ASP);
+#endif
+
+	// XXX maybe allow for alternative blurs? e.g., replace result w/ load2?
 #if AS == 1 // sharpen+denoise
 	vec4 sharpened = result + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
 #elif AS == 2 // sharpen only
 	vec4 sharpened = poi + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
 #endif
 
 #if EP // extremes preserve
@@ -1034,9 +1148,23 @@ vec4 hook()
 #endif
 
 #if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_power);
+	result = mix(sharpened, result, sharpening_strength);
 #elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_power);
+	result = mix(sharpened, poi, sharpening_strength);
+#endif
+
+#if M == 4 // edge map
+	result = sharpening_strength;
+#endif
+
+#if (M == 2 || M == 4) && defined(CHROMA_raw) // drop chroma for weight maps
+	result = vec4(0.5);
+#endif
+
+#if DV == 1
+	result = clamp(abs(poi - result) * S, 0.0, 1.0);
+#elif DV == 2
+	result = (poi - result) * 0.5 + 0.5;
 #endif
 
 	return mix(poi, result, BF);
diff --git a/portable_config/shaders/nlmeans_lgc.glsl b/portable_config/shaders/nlmeans_lgc.glsl
index 91da530b..7d18a350 100644
--- a/portable_config/shaders/nlmeans_lgc.glsl
+++ b/portable_config/shaders/nlmeans_lgc.glsl
@@ -82,7 +82,6 @@
  *   - Reflections may have a significant speed impact
  *
  * Options which always disable textureGather:
- * 	- RF
  * 	- PD
  */
 
@@ -136,11 +135,10 @@ vec4 hook()
  *
  * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
  * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so enable RF when using even 
- * patch/research sizes.
+ * incompatible with textureGather optimizations, so set NG=1 to disable them.
  */
 #ifdef LUMA_raw
-#define S 2.25
+#define S 20.0
 #define P 3
 #define R 5
 #else
@@ -151,24 +149,37 @@ vec4 hook()
 
 /* Adaptive sharpening
  *
- * Uses the blur incurred by denoising plus the weight map to perform an 
- * unsharp mask that gets applied most strongly to edges.
+ * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
+ * weight map to restrict the sharpening to edges.
  *
- * Sharpening will amplify noise, so the denoising factor (S) should usually be 
- * increased to compensate.
+ * Use M=4 to get a good look at which areas are/aren't sharpened.
  *
  * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
  * ASF: Sharpening factor, higher numbers make a sharper underlying image
  * ASP: Weight power, higher numbers use more of the sharp image
+ * ASW:
+ * 	- 0 to use pre-WD weights
+ * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
+ * ASK: Weight kernel:
+ * 	- 0 for power. This is the old method.
+ * 	- 1 for sigmoid. This is generally recommended.
+ * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
+ * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
  */
 #ifdef LUMA_raw
 #define AS 0
-#define ASF 1.0
-#define ASP 2.0
+#define ASF 2.0
+#define ASP 32.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
 #else
 #define AS 0
-#define ASF 1.0
-#define ASP 2.0
+#define ASF 2.0
+#define ASP 32.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
 #endif
 
 /* Starting weight
@@ -177,8 +188,8 @@ vec4 hook()
  * handle higher noise levels, ringing, and may be useful for other things too?
  *
  * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight variable may be used to make SW adapt to the local noise level, 
- * e.g., SW=max(avg_weight, EPSILON)
+ * avg_weight/old_avg_weight variables may be used to make SW adapt to the 
+ * local noise level, e.g., SW=max(avg_weight, EPSILON)
  */
 #ifdef LUMA_raw
 #define SW 1.0
@@ -198,7 +209,7 @@ vec4 hook()
  * 	- 0: Disable
  *
  * WDT: Threshold coefficient, higher numbers discard more
- * WDP (WD=1): Higher numbers reduce the threshold more for small sample sizes
+ * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
  */
 #ifdef LUMA_raw
 #define WD 2
@@ -210,6 +221,49 @@ vec4 hook()
 #define WDP 6.0
 #endif
 
+/* Extremes preserve
+ *
+ * Reduces denoising around very bright/dark areas. The downscaling factor of 
+ * EP (located near the top of this shader) controls the area sampled for 
+ * luminance (higher numbers consider more area).
+ *
+ * This is incompatible with RGB. If you have RGB hooks enabled then you will 
+ * have to delete the EP shader stage or specify EP=0 through nlmeans_cfg.
+ *
+ * EP: 1 to enable, 0 to disable
+ * DP: EP strength on dark patches, 0 to fully denoise
+ * BP: EP strength on bright patches, 0 to fully denoise
+ */
+#ifdef LUMA_raw
+#define EP 1
+#define BP 0.75
+#define DP 0.25
+#else
+#define EP 0
+#define BP 0.0
+#define DP 0.0
+#endif
+
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from nlmeans_cfg, so this 
+ * setting can only be enabled via nlmeans_cfg.
+ *
+ * Compares the pixel-of-interest against a guide, which could be a downscaled 
+ * image or the output of another shader such as guided.glsl
+ */
+#ifdef LUMA_raw
+#define RF 0
+#else
+#define RF 1
+#endif
+
 /* Search shape
  *
  * Determines the shape of patches and research zones. Different shapes have 
@@ -218,6 +272,8 @@ vec4 hook()
  *
  * PS applies applies to patches, RS applies to research zones.
  *
+ * Be wary of gather optimizations (see the Regarding Speed comment at the top)
+ *
  * 0: square (symmetrical)
  * 1: horizontal line (asymmetric)
  * 2: vertical line (asymmetric)
@@ -264,8 +320,9 @@ vec4 hook()
  * 	- Buggy
  *
  * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (compression 
- * noise, repeating frames), but can work very well on high quality video.
+ * struggle more with noise that persists across multiple frames (e.g., from 
+ * compression or duplicate frames), but can work very well on high quality 
+ * video.
  *
  * Motion estimation (ME) should improve quality without impacting speed.
  *
@@ -300,60 +357,51 @@ vec4 hook()
  */
 #ifdef LUMA_raw
 #define SS 0.25
-#define SD vec3(1,1,1)
+#define SD vec3(1,1,1.5)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #else
 #define SS 0.25
-#define SD vec3(1,1,1)
+#define SD vec3(1,1,1.5)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #endif
 
-/* Extremes preserve
- *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
+// Scaling factor (should match WIDTH/HEIGHT)
+#ifdef LUMA_raw
+#define SF 1
+#else
+#define SF 1
+#endif
+
+/* Estimator
  *
- * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
+ * 0: means
+ * 1: Euclidean medians (extremely slow, may be good for heavy noise)
+ * 2: weight map (not a denoiser, maybe useful for generating image masks)
+ * 3: weighted median intensity (slow, may be good for heavy noise)
+ * 4: edge map (based on the relevant AS settings)
  */
 #ifdef LUMA_raw
-#define EP 1
-#define BP 0.75
-#define DP 0.25
+#define M 0
 #else
-#define EP 0
-#define BP 0.0
-#define DP 0.0
+#define M 0
 #endif
 
-/* Robust filtering
- *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
+/* Difference visualization
  *
- * Compares the pixel-of-interest against downscaled pixels.
+ * Visualizes the difference between input/output image
  *
- * This will virtually always improve quality, but will always disable 
- * textureGather optimizations.
- *
- * The downscale factor can be modified in the WIDTH/HEIGHT directives for the 
- * RF texture (for CHROMA, RGB) and RF_LUMA (LUMA only) textures near the top 
- * of this shader, higher numbers increase blur.
- *
- * Any notation of RF as a positive number should be assumed to be referring to 
- * the downscaling factor, e.g., RF=3 means RF is set to 1 and the downscaling 
- * factor is set to 3.
+ * 0: off
+ * 1: absolute difference scaled by S
+ * 2: difference centered on 0.5
  */
 #ifdef LUMA_raw
-#define RF 0
+#define DV 0
 #else
-#define RF 1
+#define DV 0
 #endif
 
 /* Blur factor
@@ -367,30 +415,11 @@ vec4 hook()
 #define BF 1.0
 #endif
 
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-
-// Scaling factor (should match WIDTH/HEIGHT)
-#ifdef LUMA_raw
-#define SF 1
-#else
-#define SF 1
-#endif
-
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- */
+// Force disable textureGather
 #ifdef LUMA_raw
-#define M 0
+#define NG 0
 #else
-#define M 0
+#define NG 0
 #endif
 
 // Patch donut (probably useless)
@@ -400,7 +429,7 @@ vec4 hook()
 #define PD 0
 #endif
 
-// Duplicate 1st weight
+// Duplicate 1st weight (for LGC)
 #ifdef LUMA_raw
 #define D1W 0
 #else
@@ -410,6 +439,7 @@ vec4 hook()
 /* Shader code */
 
 #define EPSILON 0.00000000001
+#define M_PI 3.14159265358979323846
 
 #if PS == 6
 const int hp = P/2;
@@ -459,6 +489,7 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #define R_AREA(a) (a * T1 + RF-1)
 
 // research shapes
+// XXX would be nice to have the option of temporally-varying research sizes
 #if R == 0 || R == 1
 #define FOR_RESEARCH(r) S_1X1(r)
 const int r_area = R_AREA(1);
@@ -630,11 +661,13 @@ vec4 patch_comparison(vec3 r, vec3 r2)
 	return min_rot * p_scale;
 }
 
-#define NO_GATHER (PD == 0) // never textureGather if any of these conditions are false
+#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
 #define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
 
 #if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
 // 3x3 diamond/plus patch_comparison_gather
+// XXX extend to support arbitrary sizes (probably requires code generation)
+// XXX extend to support 3x3 square
 const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
 const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
 vec4 poi_patch = gather_offs(0, offsets);
@@ -664,6 +697,7 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 }
 #elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
 // tiled even square patch_comparison_gather
+// XXX extend to support odd square?
 vec4 patch_comparison_gather(vec3 r, vec3 r2)
 {
 	vec2 tile;
@@ -744,6 +778,7 @@ vec4 hook()
 #endif
 
 	FOR_FRAME(r) {
+	// XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
 #if T && ME == 1 // temporal & motion estimation max weight
 	if (r.z > 0) {
 		me += me_tmp;
@@ -814,6 +849,7 @@ vec4 hook()
 	} // FOR_RESEARCH
 	} // FOR_FRAME
 
+	// XXX optionally put the denoised pixel into the frame buffer?
 #if T // temporal
 #endif
 
@@ -865,12 +901,29 @@ vec4 hook()
 	result = sum / total_weight;
 #endif
 
+#if ASW == 0 // pre-WD weights
+#define AS_weight old_avg_weight
+#elif ASW == 1 // post-WD weights
+#define AS_weight avg_weight
+#endif
+
+#if ASK == 0
+	vec4 sharpening_strength = pow(AS_weight, vec4(ASP));
+#elif ASK == 1
+#define sigmoid(x) (tanh(x * 2*M_PI - M_PI)*0.5+0.5)
+	vec4 sharpening_strength = mix(pow(sigmoid(AS_weight), vec4(ASP)),
+	                               AS_weight, ASC);
+	// just in case ASC < 0 (will sharpen but it's janky XXX)
+	sharpening_strength = clamp(sharpening_strength, 0.0, 1.0);
+#elif ASK == 2
+	vec4 sharpening_strength = vec4(ASP);
+#endif
+
+	// XXX maybe allow for alternative blurs? e.g., replace result w/ load2?
 #if AS == 1 // sharpen+denoise
 	vec4 sharpened = result + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
 #elif AS == 2 // sharpen only
 	vec4 sharpened = poi + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
 #endif
 
 #if EP // extremes preserve
@@ -881,9 +934,23 @@ vec4 hook()
 #endif
 
 #if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_power);
+	result = mix(sharpened, result, sharpening_strength);
 #elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_power);
+	result = mix(sharpened, poi, sharpening_strength);
+#endif
+
+#if M == 4 // edge map
+	result = sharpening_strength;
+#endif
+
+#if (M == 2 || M == 4) && defined(CHROMA_raw) // drop chroma for weight maps
+	result = vec4(0.5);
+#endif
+
+#if DV == 1
+	result = clamp(abs(poi - result) * S, 0.0, 1.0);
+#elif DV == 2
+	result = (poi - result) * 0.5 + 0.5;
 #endif
 
 	return mix(poi, result, BF);
diff --git a/portable_config/shaders/nlmeans_lq.glsl b/portable_config/shaders/nlmeans_lq.glsl
index 39ba8e84..23da8884 100644
--- a/portable_config/shaders/nlmeans_lq.glsl
+++ b/portable_config/shaders/nlmeans_lq.glsl
@@ -82,13 +82,11 @@
  *   - Reflections may have a significant speed impact
  *
  * Options which always disable textureGather:
- * 	- RF
  * 	- PD
  */
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Non-local means (downscale)
 //!BIND HOOKED
 //!SAVE PRERF_LUMA
@@ -102,7 +100,6 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Non-local means (unscale)
 //!BIND PRERF_LUMA
 //!SAVE RF_LUMA
@@ -116,7 +113,6 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Non-local means (downscale)
 //!WIDTH LUMA.w 3 /
 //!HEIGHT LUMA.h 3 /
@@ -130,7 +126,6 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Non-local means (share)
 //!BIND RF_LUMA
 //!SAVE RF
@@ -142,7 +137,6 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!BIND HOOKED
 //!BIND RF_LUMA
 //!BIND EP
@@ -169,8 +163,7 @@ vec4 hook()
  *
  * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
  * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so enable RF when using even 
- * patch/research sizes.
+ * incompatible with textureGather optimizations, so set NG=1 to disable them.
  */
 #ifdef LUMA_raw
 #define S 1.25
@@ -184,24 +177,37 @@ vec4 hook()
 
 /* Adaptive sharpening
  *
- * Uses the blur incurred by denoising plus the weight map to perform an 
- * unsharp mask that gets applied most strongly to edges.
+ * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
+ * weight map to restrict the sharpening to edges.
  *
- * Sharpening will amplify noise, so the denoising factor (S) should usually be 
- * increased to compensate.
+ * Use M=4 to get a good look at which areas are/aren't sharpened.
  *
  * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
  * ASF: Sharpening factor, higher numbers make a sharper underlying image
  * ASP: Weight power, higher numbers use more of the sharp image
+ * ASW:
+ * 	- 0 to use pre-WD weights
+ * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
+ * ASK: Weight kernel:
+ * 	- 0 for power. This is the old method.
+ * 	- 1 for sigmoid. This is generally recommended.
+ * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
+ * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
  */
 #ifdef LUMA_raw
 #define AS 0
-#define ASF 1.0
-#define ASP 2.0
+#define ASF 2.0
+#define ASP 32.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
 #else
 #define AS 0
-#define ASF 1.0
-#define ASP 2.0
+#define ASF 2.0
+#define ASP 32.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
 #endif
 
 /* Starting weight
@@ -210,8 +216,8 @@ vec4 hook()
  * handle higher noise levels, ringing, and may be useful for other things too?
  *
  * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight variable may be used to make SW adapt to the local noise level, 
- * e.g., SW=max(avg_weight, EPSILON)
+ * avg_weight/old_avg_weight variables may be used to make SW adapt to the 
+ * local noise level, e.g., SW=max(avg_weight, EPSILON)
  */
 #ifdef LUMA_raw
 #define SW 1.0
@@ -231,7 +237,7 @@ vec4 hook()
  * 	- 0: Disable
  *
  * WDT: Threshold coefficient, higher numbers discard more
- * WDP (WD=1): Higher numbers reduce the threshold more for small sample sizes
+ * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
  */
 #ifdef LUMA_raw
 #define WD 1
@@ -243,6 +249,49 @@ vec4 hook()
 #define WDP 6.0
 #endif
 
+/* Extremes preserve
+ *
+ * Reduces denoising around very bright/dark areas. The downscaling factor of 
+ * EP (located near the top of this shader) controls the area sampled for 
+ * luminance (higher numbers consider more area).
+ *
+ * This is incompatible with RGB. If you have RGB hooks enabled then you will 
+ * have to delete the EP shader stage or specify EP=0 through nlmeans_cfg.
+ *
+ * EP: 1 to enable, 0 to disable
+ * DP: EP strength on dark patches, 0 to fully denoise
+ * BP: EP strength on bright patches, 0 to fully denoise
+ */
+#ifdef LUMA_raw
+#define EP 1
+#define BP 0.75
+#define DP 0.25
+#else
+#define EP 0
+#define BP 0.0
+#define DP 0.0
+#endif
+
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from nlmeans_cfg, so this 
+ * setting can only be enabled via nlmeans_cfg.
+ *
+ * Compares the pixel-of-interest against a guide, which could be a downscaled 
+ * image or the output of another shader such as guided.glsl
+ */
+#ifdef LUMA_raw
+#define RF 1
+#else
+#define RF 1
+#endif
+
 /* Search shape
  *
  * Determines the shape of patches and research zones. Different shapes have 
@@ -251,6 +300,8 @@ vec4 hook()
  *
  * PS applies applies to patches, RS applies to research zones.
  *
+ * Be wary of gather optimizations (see the Regarding Speed comment at the top)
+ *
  * 0: square (symmetrical)
  * 1: horizontal line (asymmetric)
  * 2: vertical line (asymmetric)
@@ -297,8 +348,9 @@ vec4 hook()
  * 	- Buggy
  *
  * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (compression 
- * noise, repeating frames), but can work very well on high quality video.
+ * struggle more with noise that persists across multiple frames (e.g., from 
+ * compression or duplicate frames), but can work very well on high quality 
+ * video.
  *
  * Motion estimation (ME) should improve quality without impacting speed.
  *
@@ -333,60 +385,51 @@ vec4 hook()
  */
 #ifdef LUMA_raw
 #define SS 0.25
-#define SD vec3(1,1,1)
+#define SD vec3(1,1,1.5)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #else
 #define SS 0.25
-#define SD vec3(1,1,1)
+#define SD vec3(1,1,1.5)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #endif
 
-/* Extremes preserve
- *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
+// Scaling factor (should match WIDTH/HEIGHT)
+#ifdef LUMA_raw
+#define SF 1
+#else
+#define SF 1
+#endif
+
+/* Estimator
  *
- * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
+ * 0: means
+ * 1: Euclidean medians (extremely slow, may be good for heavy noise)
+ * 2: weight map (not a denoiser, maybe useful for generating image masks)
+ * 3: weighted median intensity (slow, may be good for heavy noise)
+ * 4: edge map (based on the relevant AS settings)
  */
 #ifdef LUMA_raw
-#define EP 1
-#define BP 0.75
-#define DP 0.25
+#define M 0
 #else
-#define EP 0
-#define BP 0.0
-#define DP 0.0
+#define M 0
 #endif
 
-/* Robust filtering
- *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
+/* Difference visualization
  *
- * Compares the pixel-of-interest against downscaled pixels.
+ * Visualizes the difference between input/output image
  *
- * This will virtually always improve quality, but will always disable 
- * textureGather optimizations.
- *
- * The downscale factor can be modified in the WIDTH/HEIGHT directives for the 
- * RF texture (for CHROMA, RGB) and RF_LUMA (LUMA only) textures near the top 
- * of this shader, higher numbers increase blur.
- *
- * Any notation of RF as a positive number should be assumed to be referring to 
- * the downscaling factor, e.g., RF=3 means RF is set to 1 and the downscaling 
- * factor is set to 3.
+ * 0: off
+ * 1: absolute difference scaled by S
+ * 2: difference centered on 0.5
  */
 #ifdef LUMA_raw
-#define RF 1
+#define DV 0
 #else
-#define RF 1
+#define DV 0
 #endif
 
 /* Blur factor
@@ -400,30 +443,11 @@ vec4 hook()
 #define BF 1.0
 #endif
 
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-
-// Scaling factor (should match WIDTH/HEIGHT)
-#ifdef LUMA_raw
-#define SF 1
-#else
-#define SF 1
-#endif
-
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- */
+// Force disable textureGather
 #ifdef LUMA_raw
-#define M 0
+#define NG 0
 #else
-#define M 0
+#define NG 0
 #endif
 
 // Patch donut (probably useless)
@@ -433,7 +457,7 @@ vec4 hook()
 #define PD 0
 #endif
 
-// Duplicate 1st weight
+// Duplicate 1st weight (for LGC)
 #ifdef LUMA_raw
 #define D1W 0
 #else
@@ -443,6 +467,7 @@ vec4 hook()
 /* Shader code */
 
 #define EPSILON 0.00000000001
+#define M_PI 3.14159265358979323846
 
 #if PS == 6
 const int hp = P/2;
@@ -492,6 +517,7 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #define R_AREA(a) (a * T1 + RF-1)
 
 // research shapes
+// XXX would be nice to have the option of temporally-varying research sizes
 #if R == 0 || R == 1
 #define FOR_RESEARCH(r) S_1X1(r)
 const int r_area = R_AREA(1);
@@ -663,11 +689,13 @@ vec4 patch_comparison(vec3 r, vec3 r2)
 	return min_rot * p_scale;
 }
 
-#define NO_GATHER (PD == 0) // never textureGather if any of these conditions are false
+#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
 #define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
 
 #if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
 // 3x3 diamond/plus patch_comparison_gather
+// XXX extend to support arbitrary sizes (probably requires code generation)
+// XXX extend to support 3x3 square
 const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
 const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
 vec4 poi_patch = gather_offs(0, offsets);
@@ -697,6 +725,7 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 }
 #elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
 // tiled even square patch_comparison_gather
+// XXX extend to support odd square?
 vec4 patch_comparison_gather(vec3 r, vec3 r2)
 {
 	vec2 tile;
@@ -777,6 +806,7 @@ vec4 hook()
 #endif
 
 	FOR_FRAME(r) {
+	// XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
 #if T && ME == 1 // temporal & motion estimation max weight
 	if (r.z > 0) {
 		me += me_tmp;
@@ -847,6 +877,7 @@ vec4 hook()
 	} // FOR_RESEARCH
 	} // FOR_FRAME
 
+	// XXX optionally put the denoised pixel into the frame buffer?
 #if T // temporal
 #endif
 
@@ -898,12 +929,29 @@ vec4 hook()
 	result = sum / total_weight;
 #endif
 
+#if ASW == 0 // pre-WD weights
+#define AS_weight old_avg_weight
+#elif ASW == 1 // post-WD weights
+#define AS_weight avg_weight
+#endif
+
+#if ASK == 0
+	vec4 sharpening_strength = pow(AS_weight, vec4(ASP));
+#elif ASK == 1
+#define sigmoid(x) (tanh(x * 2*M_PI - M_PI)*0.5+0.5)
+	vec4 sharpening_strength = mix(pow(sigmoid(AS_weight), vec4(ASP)),
+	                               AS_weight, ASC);
+	// just in case ASC < 0 (will sharpen but it's janky XXX)
+	sharpening_strength = clamp(sharpening_strength, 0.0, 1.0);
+#elif ASK == 2
+	vec4 sharpening_strength = vec4(ASP);
+#endif
+
+	// XXX maybe allow for alternative blurs? e.g., replace result w/ load2?
 #if AS == 1 // sharpen+denoise
 	vec4 sharpened = result + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
 #elif AS == 2 // sharpen only
 	vec4 sharpened = poi + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
 #endif
 
 #if EP // extremes preserve
@@ -914,9 +962,23 @@ vec4 hook()
 #endif
 
 #if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_power);
+	result = mix(sharpened, result, sharpening_strength);
 #elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_power);
+	result = mix(sharpened, poi, sharpening_strength);
+#endif
+
+#if M == 4 // edge map
+	result = sharpening_strength;
+#endif
+
+#if (M == 2 || M == 4) && defined(CHROMA_raw) // drop chroma for weight maps
+	result = vec4(0.5);
+#endif
+
+#if DV == 1
+	result = clamp(abs(poi - result) * S, 0.0, 1.0);
+#elif DV == 2
+	result = (poi - result) * 0.5 + 0.5;
 #endif
 
 	return mix(poi, result, BF);
diff --git a/portable_config/shaders/nlmeans_luma.glsl b/portable_config/shaders/nlmeans_luma.glsl
index 3e8a733c..6944449c 100644
--- a/portable_config/shaders/nlmeans_luma.glsl
+++ b/portable_config/shaders/nlmeans_luma.glsl
@@ -82,11 +82,10 @@
  *   - Reflections may have a significant speed impact
  *
  * Options which always disable textureGather:
- * 	- RF
  * 	- PD
  */
 
-// The following is shader code injected from guided_s.glsl
+// The following is shader code injected from guided.glsl
 /* vi: ft=c
  *
  * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
@@ -105,23 +104,50 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-//desc: "Self-guided" guided filter
+//desc: Guided filter guided by the downscaled image
 
-/* The radius can be adjusted with the MEANIP stage's downscaling factor. 
+/* The radius can be adjusted with the MEANI stage's downscaling factor. 
  * Higher numbers give a bigger radius.
  *
  * The E variable can be found in the A stage.
  *
- * The subsampling (fast guided filter) can be adjusted with the IP stage's 
+ * The subsampling (fast guided filter) can be adjusted with the I stage's 
  * downscaling factor. Higher numbers are faster.
+ *
+ * The guide's subsampling can be adjusted with the PREI stage's downscaling 
+ * factor. Higher numbers downscale more.
  */
 
 //!HOOK LUMA
-//!DESC Guided filter (IP)
+//!DESC Guided filter (PREI)
 //!BIND HOOKED
+//!WIDTH HOOKED.w 1.25 /
+//!HEIGHT HOOKED.h 1.25 /
+//!SAVE _INJ_PREI
+
+vec4 hook()
+{
+	 return HOOKED_texOff(0); 
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (I)
+//!BIND _INJ_PREI
 //!WIDTH HOOKED.w 1.0 /
 //!HEIGHT HOOKED.h 1.0 /
-//!SAVE _INJ_IP
+//!SAVE _INJ_I
+
+vec4 hook()
+{
+return _INJ_PREI_texOff(0);
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (P)
+//!BIND HOOKED
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_P
 
 vec4 hook()
 {
@@ -129,76 +155,116 @@ vec4 hook()
 }
 
 //!HOOK LUMA
-//!DESC Guided filter (MEANIP)
-//!BIND _INJ_IP
-//!WIDTH _INJ_IP.w 2.0 /
-//!HEIGHT _INJ_IP.h 2.0 /
-//!SAVE _INJ_MEANIP
+//!DESC Guided filter (MEANI)
+//!BIND _INJ_I
+//!WIDTH _INJ_I.w 1.5 /
+//!HEIGHT _INJ_I.h 1.5 /
+//!SAVE _INJ_MEANI
+
+vec4 hook()
+{
+return _INJ_I_texOff(0);
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (MEANP)
+//!BIND _INJ_P
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_MEANP
+
+vec4 hook()
+{
+return _INJ_P_texOff(0);
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (_INJ_I_SQ)
+//!BIND _INJ_I
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_I_SQ
 
 vec4 hook()
 {
-return _INJ_IP_texOff(0);
+return _INJ_I_texOff(0) * _INJ_I_texOff(0);
 }
 
 //!HOOK LUMA
-//!DESC Guided filter (_INJ_IP_SQ)
-//!BIND _INJ_IP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
-//!SAVE _INJ_IP_SQ
+//!DESC Guided filter (_INJ_IXP)
+//!BIND _INJ_I
+//!BIND _INJ_P
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_IXP
 
 vec4 hook()
 {
-return _INJ_IP_texOff(0) * _INJ_IP_texOff(0);
+return _INJ_I_texOff(0) * _INJ_P_texOff(0);
 }
 
 //!HOOK LUMA
-//!DESC Guided filter (CORRIP)
-//!BIND _INJ_IP_SQ
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
-//!SAVE _INJ_CORRIP
+//!DESC Guided filter (CORRI)
+//!BIND _INJ_I_SQ
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_CORRI
 
 vec4 hook()
 {
-return _INJ_IP_SQ_texOff(0);
+return _INJ_I_SQ_texOff(0);
+}
+
+//!HOOK LUMA
+//!DESC Guided filter (CORRP)
+//!BIND _INJ_IXP
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_CORRP
+
+vec4 hook()
+{
+return _INJ_IXP_texOff(0);
 }
 
 //!HOOK LUMA
 //!DESC Guided filter (A)
-//!BIND _INJ_MEANIP
-//!BIND _INJ_CORRIP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
+//!BIND _INJ_MEANI
+//!BIND _INJ_MEANP
+//!BIND _INJ_CORRI
+//!BIND _INJ_CORRP
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
 //!SAVE _INJ_A
 
-#define E 0.001
+#define E 0.0013
 
 vec4 hook()
 {
-vec4 var = _INJ_CORRIP_texOff(0) - _INJ_MEANIP_texOff(0) * _INJ_MEANIP_texOff(0);
-	 vec4 cov = var; 
+vec4 var = _INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0);
+vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0);
 	 return cov / (var + E); 
 }
 
 //!HOOK LUMA
 //!DESC Guided filter (B)
 //!BIND _INJ_A
-//!BIND _INJ_MEANIP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
+//!BIND _INJ_MEANI
+//!BIND _INJ_MEANP
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
 //!SAVE _INJ_B
 
 vec4 hook()
 {
-return _INJ_MEANIP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANIP_texOff(0);
+return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0);
 }
 
 //!HOOK LUMA
 //!DESC Guided filter (MEANA)
 //!BIND _INJ_A
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
 //!SAVE _INJ_MEANA
 
 vec4 hook()
@@ -209,8 +275,8 @@ return _INJ_A_texOff(0);
 //!HOOK LUMA
 //!DESC Guided filter (MEANB)
 //!BIND _INJ_B
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
 //!SAVE _INJ_MEANB
 
 vec4 hook()
@@ -230,7 +296,7 @@ vec4 hook()
 return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
 }
 
-// End of source code injected from guided_s.glsl
+// End of source code injected from guided.glsl
 //!HOOK LUMA
 //!DESC Non-local means (downscale)
 //!WIDTH LUMA.w 3 /
@@ -280,11 +346,10 @@ vec4 hook()
  *
  * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
  * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so enable RF when using even 
- * patch/research sizes.
+ * incompatible with textureGather optimizations, so set NG=1 to disable them.
  */
 #ifdef LUMA_raw
-#define S 2.25
+#define S 20.0
 #define P 3
 #define R 5
 #else
@@ -295,24 +360,37 @@ vec4 hook()
 
 /* Adaptive sharpening
  *
- * Uses the blur incurred by denoising plus the weight map to perform an 
- * unsharp mask that gets applied most strongly to edges.
+ * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
+ * weight map to restrict the sharpening to edges.
  *
- * Sharpening will amplify noise, so the denoising factor (S) should usually be 
- * increased to compensate.
+ * Use M=4 to get a good look at which areas are/aren't sharpened.
  *
  * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
  * ASF: Sharpening factor, higher numbers make a sharper underlying image
  * ASP: Weight power, higher numbers use more of the sharp image
+ * ASW:
+ * 	- 0 to use pre-WD weights
+ * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
+ * ASK: Weight kernel:
+ * 	- 0 for power. This is the old method.
+ * 	- 1 for sigmoid. This is generally recommended.
+ * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
+ * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
  */
 #ifdef LUMA_raw
 #define AS 0
-#define ASF 1.0
-#define ASP 2.0
+#define ASF 2.0
+#define ASP 32.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
 #else
 #define AS 0
-#define ASF 1.0
-#define ASP 2.0
+#define ASF 2.0
+#define ASP 32.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
 #endif
 
 /* Starting weight
@@ -321,8 +399,8 @@ vec4 hook()
  * handle higher noise levels, ringing, and may be useful for other things too?
  *
  * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight variable may be used to make SW adapt to the local noise level, 
- * e.g., SW=max(avg_weight, EPSILON)
+ * avg_weight/old_avg_weight variables may be used to make SW adapt to the 
+ * local noise level, e.g., SW=max(avg_weight, EPSILON)
  */
 #ifdef LUMA_raw
 #define SW 1.0
@@ -342,7 +420,7 @@ vec4 hook()
  * 	- 0: Disable
  *
  * WDT: Threshold coefficient, higher numbers discard more
- * WDP (WD=1): Higher numbers reduce the threshold more for small sample sizes
+ * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
  */
 #ifdef LUMA_raw
 #define WD 2
@@ -354,6 +432,49 @@ vec4 hook()
 #define WDP 6.0
 #endif
 
+/* Extremes preserve
+ *
+ * Reduces denoising around very bright/dark areas. The downscaling factor of 
+ * EP (located near the top of this shader) controls the area sampled for 
+ * luminance (higher numbers consider more area).
+ *
+ * This is incompatible with RGB. If you have RGB hooks enabled then you will 
+ * have to delete the EP shader stage or specify EP=0 through nlmeans_cfg.
+ *
+ * EP: 1 to enable, 0 to disable
+ * DP: EP strength on dark patches, 0 to fully denoise
+ * BP: EP strength on bright patches, 0 to fully denoise
+ */
+#ifdef LUMA_raw
+#define EP 1
+#define BP 0.75
+#define DP 0.25
+#else
+#define EP 0
+#define BP 0.0
+#define DP 0.0
+#endif
+
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from nlmeans_cfg, so this 
+ * setting can only be enabled via nlmeans_cfg.
+ *
+ * Compares the pixel-of-interest against a guide, which could be a downscaled 
+ * image or the output of another shader such as guided.glsl
+ */
+#ifdef LUMA_raw
+#define RF 1
+#else
+#define RF 1
+#endif
+
 /* Search shape
  *
  * Determines the shape of patches and research zones. Different shapes have 
@@ -362,6 +483,8 @@ vec4 hook()
  *
  * PS applies applies to patches, RS applies to research zones.
  *
+ * Be wary of gather optimizations (see the Regarding Speed comment at the top)
+ *
  * 0: square (symmetrical)
  * 1: horizontal line (asymmetric)
  * 2: vertical line (asymmetric)
@@ -408,8 +531,9 @@ vec4 hook()
  * 	- Buggy
  *
  * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (compression 
- * noise, repeating frames), but can work very well on high quality video.
+ * struggle more with noise that persists across multiple frames (e.g., from 
+ * compression or duplicate frames), but can work very well on high quality 
+ * video.
  *
  * Motion estimation (ME) should improve quality without impacting speed.
  *
@@ -444,60 +568,51 @@ vec4 hook()
  */
 #ifdef LUMA_raw
 #define SS 0.25
-#define SD vec3(1,1,1)
+#define SD vec3(1,1,1.5)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #else
 #define SS 0.25
-#define SD vec3(1,1,1)
+#define SD vec3(1,1,1.5)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #endif
 
-/* Extremes preserve
- *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
+// Scaling factor (should match WIDTH/HEIGHT)
+#ifdef LUMA_raw
+#define SF 1
+#else
+#define SF 1
+#endif
+
+/* Estimator
  *
- * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
+ * 0: means
+ * 1: Euclidean medians (extremely slow, may be good for heavy noise)
+ * 2: weight map (not a denoiser, maybe useful for generating image masks)
+ * 3: weighted median intensity (slow, may be good for heavy noise)
+ * 4: edge map (based on the relevant AS settings)
  */
 #ifdef LUMA_raw
-#define EP 1
-#define BP 0.75
-#define DP 0.25
+#define M 0
 #else
-#define EP 0
-#define BP 0.0
-#define DP 0.0
+#define M 0
 #endif
 
-/* Robust filtering
- *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
- *
- * Compares the pixel-of-interest against downscaled pixels.
+/* Difference visualization
  *
- * This will virtually always improve quality, but will always disable 
- * textureGather optimizations.
+ * Visualizes the difference between input/output image
  *
- * The downscale factor can be modified in the WIDTH/HEIGHT directives for the 
- * RF texture (for CHROMA, RGB) and RF_LUMA (LUMA only) textures near the top 
- * of this shader, higher numbers increase blur.
- *
- * Any notation of RF as a positive number should be assumed to be referring to 
- * the downscaling factor, e.g., RF=3 means RF is set to 1 and the downscaling 
- * factor is set to 3.
+ * 0: off
+ * 1: absolute difference scaled by S
+ * 2: difference centered on 0.5
  */
 #ifdef LUMA_raw
-#define RF 1
+#define DV 0
 #else
-#define RF 1
+#define DV 0
 #endif
 
 /* Blur factor
@@ -511,30 +626,11 @@ vec4 hook()
 #define BF 1.0
 #endif
 
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-
-// Scaling factor (should match WIDTH/HEIGHT)
+// Force disable textureGather
 #ifdef LUMA_raw
-#define SF 1
+#define NG 0
 #else
-#define SF 1
-#endif
-
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- */
-#ifdef LUMA_raw
-#define M 0
-#else
-#define M 0
+#define NG 0
 #endif
 
 // Patch donut (probably useless)
@@ -544,7 +640,7 @@ vec4 hook()
 #define PD 0
 #endif
 
-// Duplicate 1st weight
+// Duplicate 1st weight (for LGC)
 #ifdef LUMA_raw
 #define D1W 0
 #else
@@ -554,6 +650,7 @@ vec4 hook()
 /* Shader code */
 
 #define EPSILON 0.00000000001
+#define M_PI 3.14159265358979323846
 
 #if PS == 6
 const int hp = P/2;
@@ -603,6 +700,7 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #define R_AREA(a) (a * T1 + RF-1)
 
 // research shapes
+// XXX would be nice to have the option of temporally-varying research sizes
 #if R == 0 || R == 1
 #define FOR_RESEARCH(r) S_1X1(r)
 const int r_area = R_AREA(1);
@@ -774,11 +872,13 @@ vec4 patch_comparison(vec3 r, vec3 r2)
 	return min_rot * p_scale;
 }
 
-#define NO_GATHER (PD == 0) // never textureGather if any of these conditions are false
+#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
 #define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
 
 #if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
 // 3x3 diamond/plus patch_comparison_gather
+// XXX extend to support arbitrary sizes (probably requires code generation)
+// XXX extend to support 3x3 square
 const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
 const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
 vec4 poi_patch = gather_offs(0, offsets);
@@ -808,6 +908,7 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 }
 #elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
 // tiled even square patch_comparison_gather
+// XXX extend to support odd square?
 vec4 patch_comparison_gather(vec3 r, vec3 r2)
 {
 	vec2 tile;
@@ -888,6 +989,7 @@ vec4 hook()
 #endif
 
 	FOR_FRAME(r) {
+	// XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
 #if T && ME == 1 // temporal & motion estimation max weight
 	if (r.z > 0) {
 		me += me_tmp;
@@ -958,6 +1060,7 @@ vec4 hook()
 	} // FOR_RESEARCH
 	} // FOR_FRAME
 
+	// XXX optionally put the denoised pixel into the frame buffer?
 #if T // temporal
 #endif
 
@@ -1009,12 +1112,29 @@ vec4 hook()
 	result = sum / total_weight;
 #endif
 
+#if ASW == 0 // pre-WD weights
+#define AS_weight old_avg_weight
+#elif ASW == 1 // post-WD weights
+#define AS_weight avg_weight
+#endif
+
+#if ASK == 0
+	vec4 sharpening_strength = pow(AS_weight, vec4(ASP));
+#elif ASK == 1
+#define sigmoid(x) (tanh(x * 2*M_PI - M_PI)*0.5+0.5)
+	vec4 sharpening_strength = mix(pow(sigmoid(AS_weight), vec4(ASP)),
+	                               AS_weight, ASC);
+	// just in case ASC < 0 (will sharpen but it's janky XXX)
+	sharpening_strength = clamp(sharpening_strength, 0.0, 1.0);
+#elif ASK == 2
+	vec4 sharpening_strength = vec4(ASP);
+#endif
+
+	// XXX maybe allow for alternative blurs? e.g., replace result w/ load2?
 #if AS == 1 // sharpen+denoise
 	vec4 sharpened = result + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
 #elif AS == 2 // sharpen only
 	vec4 sharpened = poi + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
 #endif
 
 #if EP // extremes preserve
@@ -1025,9 +1145,23 @@ vec4 hook()
 #endif
 
 #if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_power);
+	result = mix(sharpened, result, sharpening_strength);
 #elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_power);
+	result = mix(sharpened, poi, sharpening_strength);
+#endif
+
+#if M == 4 // edge map
+	result = sharpening_strength;
+#endif
+
+#if (M == 2 || M == 4) && defined(CHROMA_raw) // drop chroma for weight maps
+	result = vec4(0.5);
+#endif
+
+#if DV == 1
+	result = clamp(abs(poi - result) * S, 0.0, 1.0);
+#elif DV == 2
+	result = (poi - result) * 0.5 + 0.5;
 #endif
 
 	return mix(poi, result, BF);
diff --git a/portable_config/shaders/nlmeans_temporal.glsl b/portable_config/shaders/nlmeans_temporal.glsl
index 3b24553b..f16ec8a4 100644
--- a/portable_config/shaders/nlmeans_temporal.glsl
+++ b/portable_config/shaders/nlmeans_temporal.glsl
@@ -82,11 +82,10 @@
  *   - Reflections may have a significant speed impact
  *
  * Options which always disable textureGather:
- * 	- RF
  * 	- PD
  */
 
-// The following is shader code injected from guided_s.glsl
+// The following is shader code injected from guided.glsl
 /* vi: ft=c
  *
  * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
@@ -105,25 +104,53 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-//desc: "Self-guided" guided filter
+//desc: Guided filter guided by the downscaled image
 
-/* The radius can be adjusted with the MEANIP stage's downscaling factor. 
+/* The radius can be adjusted with the MEANI stage's downscaling factor. 
  * Higher numbers give a bigger radius.
  *
  * The E variable can be found in the A stage.
  *
- * The subsampling (fast guided filter) can be adjusted with the IP stage's 
+ * The subsampling (fast guided filter) can be adjusted with the I stage's 
  * downscaling factor. Higher numbers are faster.
+ *
+ * The guide's subsampling can be adjusted with the PREI stage's downscaling 
+ * factor. Higher numbers downscale more.
  */
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (IP)
+//!DESC Guided filter (PREI)
 //!BIND HOOKED
+//!WIDTH HOOKED.w 1.25 /
+//!HEIGHT HOOKED.h 1.25 /
+//!SAVE _INJ_PREI
+
+vec4 hook()
+{
+	 return HOOKED_texOff(0); 
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (I)
+//!BIND _INJ_PREI
 //!WIDTH HOOKED.w 1.0 /
 //!HEIGHT HOOKED.h 1.0 /
-//!SAVE _INJ_IP
+//!SAVE _INJ_I
+
+vec4 hook()
+{
+return _INJ_PREI_texOff(0);
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (P)
+//!BIND HOOKED
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_P
 
 vec4 hook()
 {
@@ -132,87 +159,124 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (MEANIP)
-//!BIND _INJ_IP
-//!WIDTH _INJ_IP.w 2.0 /
-//!HEIGHT _INJ_IP.h 2.0 /
-//!SAVE _INJ_MEANIP
+//!DESC Guided filter (MEANI)
+//!BIND _INJ_I
+//!WIDTH _INJ_I.w 1.5 /
+//!HEIGHT _INJ_I.h 1.5 /
+//!SAVE _INJ_MEANI
 
 vec4 hook()
 {
-return _INJ_IP_texOff(0);
+return _INJ_I_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (_INJ_IP_SQ)
-//!BIND _INJ_IP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
-//!SAVE _INJ_IP_SQ
+//!DESC Guided filter (MEANP)
+//!BIND _INJ_P
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_MEANP
 
 vec4 hook()
 {
-return _INJ_IP_texOff(0) * _INJ_IP_texOff(0);
+return _INJ_P_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
-//!DESC Guided filter (CORRIP)
-//!BIND _INJ_IP_SQ
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
-//!SAVE _INJ_CORRIP
+//!DESC Guided filter (_INJ_I_SQ)
+//!BIND _INJ_I
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_I_SQ
 
 vec4 hook()
 {
-return _INJ_IP_SQ_texOff(0);
+return _INJ_I_texOff(0) * _INJ_I_texOff(0);
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (_INJ_IXP)
+//!BIND _INJ_I
+//!BIND _INJ_P
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
+//!SAVE _INJ_IXP
+
+vec4 hook()
+{
+return _INJ_I_texOff(0) * _INJ_P_texOff(0);
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (CORRI)
+//!BIND _INJ_I_SQ
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_CORRI
+
+vec4 hook()
+{
+return _INJ_I_SQ_texOff(0);
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!DESC Guided filter (CORRP)
+//!BIND _INJ_IXP
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
+//!SAVE _INJ_CORRP
+
+vec4 hook()
+{
+return _INJ_IXP_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter (A)
-//!BIND _INJ_MEANIP
-//!BIND _INJ_CORRIP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
+//!BIND _INJ_MEANI
+//!BIND _INJ_MEANP
+//!BIND _INJ_CORRI
+//!BIND _INJ_CORRP
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
 //!SAVE _INJ_A
 
-#define E 0.001
+#define E 0.0013
 
 vec4 hook()
 {
-vec4 var = _INJ_CORRIP_texOff(0) - _INJ_MEANIP_texOff(0) * _INJ_MEANIP_texOff(0);
-	 vec4 cov = var; 
+vec4 var = _INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0);
+vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0);
 	 return cov / (var + E); 
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter (B)
 //!BIND _INJ_A
-//!BIND _INJ_MEANIP
-//!WIDTH _INJ_IP.w
-//!HEIGHT _INJ_IP.h
+//!BIND _INJ_MEANI
+//!BIND _INJ_MEANP
+//!WIDTH _INJ_I.w
+//!HEIGHT _INJ_I.h
 //!SAVE _INJ_B
 
 vec4 hook()
 {
-return _INJ_MEANIP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANIP_texOff(0);
+return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0);
 }
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter (MEANA)
 //!BIND _INJ_A
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
 //!SAVE _INJ_MEANA
 
 vec4 hook()
@@ -222,11 +286,10 @@ return _INJ_A_texOff(0);
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter (MEANB)
 //!BIND _INJ_B
-//!WIDTH _INJ_MEANIP.w
-//!HEIGHT _INJ_MEANIP.h
+//!WIDTH _INJ_MEANI.w
+//!HEIGHT _INJ_MEANI.h
 //!SAVE _INJ_MEANB
 
 vec4 hook()
@@ -236,7 +299,6 @@ return _INJ_B_texOff(0);
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Guided filter
 //!BIND HOOKED
 //!BIND _INJ_MEANA
@@ -248,10 +310,9 @@ vec4 hook()
 return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
 }
 
-// End of source code injected from guided_s.glsl
+// End of source code injected from guided.glsl
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Non-local means (downscale)
 //!WIDTH LUMA.w 3 /
 //!HEIGHT LUMA.h 3 /
@@ -265,7 +326,6 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!DESC Non-local means (share)
 //!BIND RF_LUMA
 //!SAVE RF
@@ -277,7 +337,6 @@ vec4 hook()
 
 //!HOOK LUMA
 //!HOOK CHROMA
-//!HOOK RGB
 //!BIND HOOKED
 //!BIND RF_LUMA
 //!BIND EP
@@ -307,11 +366,10 @@ vec4 hook()
  *
  * Even-numbered patch/research sizes will sample between pixels unless PS=6. 
  * It's not known whether this is ever useful behavior or not. This is 
- * incompatible with textureGather optimizations, so enable RF when using even 
- * patch/research sizes.
+ * incompatible with textureGather optimizations, so set NG=1 to disable them.
  */
 #ifdef LUMA_raw
-#define S 2.25
+#define S 20.0
 #define P 3
 #define R 5
 #else
@@ -322,24 +380,37 @@ vec4 hook()
 
 /* Adaptive sharpening
  *
- * Uses the blur incurred by denoising plus the weight map to perform an 
- * unsharp mask that gets applied most strongly to edges.
+ * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
+ * weight map to restrict the sharpening to edges.
  *
- * Sharpening will amplify noise, so the denoising factor (S) should usually be 
- * increased to compensate.
+ * Use M=4 to get a good look at which areas are/aren't sharpened.
  *
  * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
  * ASF: Sharpening factor, higher numbers make a sharper underlying image
  * ASP: Weight power, higher numbers use more of the sharp image
+ * ASW:
+ * 	- 0 to use pre-WD weights
+ * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
+ * ASK: Weight kernel:
+ * 	- 0 for power. This is the old method.
+ * 	- 1 for sigmoid. This is generally recommended.
+ * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
+ * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
  */
 #ifdef LUMA_raw
 #define AS 0
-#define ASF 1.0
-#define ASP 2.0
+#define ASF 2.0
+#define ASP 32.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
 #else
 #define AS 0
-#define ASF 1.0
-#define ASP 2.0
+#define ASF 2.0
+#define ASP 32.0
+#define ASW 0
+#define ASK 1
+#define ASC 0.0
 #endif
 
 /* Starting weight
@@ -348,8 +419,8 @@ vec4 hook()
  * handle higher noise levels, ringing, and may be useful for other things too?
  *
  * EPSILON should be used instead of zero to avoid divide-by-zero errors. The 
- * avg_weight variable may be used to make SW adapt to the local noise level, 
- * e.g., SW=max(avg_weight, EPSILON)
+ * avg_weight/old_avg_weight variables may be used to make SW adapt to the 
+ * local noise level, e.g., SW=max(avg_weight, EPSILON)
  */
 #ifdef LUMA_raw
 #define SW 1.0
@@ -369,7 +440,7 @@ vec4 hook()
  * 	- 0: Disable
  *
  * WDT: Threshold coefficient, higher numbers discard more
- * WDP (WD=1): Higher numbers reduce the threshold more for small sample sizes
+ * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
  */
 #ifdef LUMA_raw
 #define WD 1
@@ -381,6 +452,49 @@ vec4 hook()
 #define WDP 6.0
 #endif
 
+/* Extremes preserve
+ *
+ * Reduces denoising around very bright/dark areas. The downscaling factor of 
+ * EP (located near the top of this shader) controls the area sampled for 
+ * luminance (higher numbers consider more area).
+ *
+ * This is incompatible with RGB. If you have RGB hooks enabled then you will 
+ * have to delete the EP shader stage or specify EP=0 through nlmeans_cfg.
+ *
+ * EP: 1 to enable, 0 to disable
+ * DP: EP strength on dark patches, 0 to fully denoise
+ * BP: EP strength on bright patches, 0 to fully denoise
+ */
+#ifdef LUMA_raw
+#define EP 1
+#define BP 0.75
+#define DP 0.25
+#else
+#define EP 0
+#define BP 0.0
+#define DP 0.0
+#endif
+
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from nlmeans_cfg, so this 
+ * setting can only be enabled via nlmeans_cfg.
+ *
+ * Compares the pixel-of-interest against a guide, which could be a downscaled 
+ * image or the output of another shader such as guided.glsl
+ */
+#ifdef LUMA_raw
+#define RF 1
+#else
+#define RF 1
+#endif
+
 /* Search shape
  *
  * Determines the shape of patches and research zones. Different shapes have 
@@ -389,6 +503,8 @@ vec4 hook()
  *
  * PS applies applies to patches, RS applies to research zones.
  *
+ * Be wary of gather optimizations (see the Regarding Speed comment at the top)
+ *
  * 0: square (symmetrical)
  * 1: horizontal line (asymmetric)
  * 2: vertical line (asymmetric)
@@ -435,8 +551,9 @@ vec4 hook()
  * 	- Buggy
  *
  * Gather samples across multiple frames. May cause motion blur and may 
- * struggle more with noise that persists across multiple frames (compression 
- * noise, repeating frames), but can work very well on high quality video.
+ * struggle more with noise that persists across multiple frames (e.g., from 
+ * compression or duplicate frames), but can work very well on high quality 
+ * video.
  *
  * Motion estimation (ME) should improve quality without impacting speed.
  *
@@ -471,60 +588,51 @@ vec4 hook()
  */
 #ifdef LUMA_raw
 #define SS 0.25
-#define SD vec3(1,1,1)
+#define SD vec3(1,1,1.5)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #else
 #define SS 0.25
-#define SD vec3(1,1,1)
+#define SD vec3(1,1,1.5)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #endif
 
-/* Extremes preserve
- *
- * Reduces denoising around very bright/dark areas. The downscaling factor of 
- * EP (located near the top of this shader) controls the area sampled for 
- * luminance (higher numbers consider more area).
+// Scaling factor (should match WIDTH/HEIGHT)
+#ifdef LUMA_raw
+#define SF 1
+#else
+#define SF 1
+#endif
+
+/* Estimator
  *
- * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
+ * 0: means
+ * 1: Euclidean medians (extremely slow, may be good for heavy noise)
+ * 2: weight map (not a denoiser, maybe useful for generating image masks)
+ * 3: weighted median intensity (slow, may be good for heavy noise)
+ * 4: edge map (based on the relevant AS settings)
  */
 #ifdef LUMA_raw
-#define EP 1
-#define BP 0.75
-#define DP 0.25
+#define M 0
 #else
-#define EP 0
-#define BP 0.0
-#define DP 0.0
+#define M 0
 #endif
 
-/* Robust filtering
- *
- * This setting is dependent on code generation from nlmeans_cfg, so this 
- * setting can only be enabled via nlmeans_cfg.
+/* Difference visualization
  *
- * Compares the pixel-of-interest against downscaled pixels.
+ * Visualizes the difference between input/output image
  *
- * This will virtually always improve quality, but will always disable 
- * textureGather optimizations.
- *
- * The downscale factor can be modified in the WIDTH/HEIGHT directives for the 
- * RF texture (for CHROMA, RGB) and RF_LUMA (LUMA only) textures near the top 
- * of this shader, higher numbers increase blur.
- *
- * Any notation of RF as a positive number should be assumed to be referring to 
- * the downscaling factor, e.g., RF=3 means RF is set to 1 and the downscaling 
- * factor is set to 3.
+ * 0: off
+ * 1: absolute difference scaled by S
+ * 2: difference centered on 0.5
  */
 #ifdef LUMA_raw
-#define RF 1
+#define DV 0
 #else
-#define RF 1
+#define DV 0
 #endif
 
 /* Blur factor
@@ -538,30 +646,11 @@ vec4 hook()
 #define BF 1.0
 #endif
 
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-
-// Scaling factor (should match WIDTH/HEIGHT)
-#ifdef LUMA_raw
-#define SF 1
-#else
-#define SF 1
-#endif
-
-/* Estimator
- *
- * 0: means
- * 1: Euclidean medians (extremely slow, may be good for heavy noise)
- * 2: weight map (not a denoiser, maybe useful for generating image masks)
- * 3: weighted median intensity (slow, may be good for heavy noise)
- */
+// Force disable textureGather
 #ifdef LUMA_raw
-#define M 0
+#define NG 0
 #else
-#define M 0
+#define NG 0
 #endif
 
 // Patch donut (probably useless)
@@ -571,7 +660,7 @@ vec4 hook()
 #define PD 0
 #endif
 
-// Duplicate 1st weight
+// Duplicate 1st weight (for LGC)
 #ifdef LUMA_raw
 #define D1W 0
 #else
@@ -581,6 +670,7 @@ vec4 hook()
 /* Shader code */
 
 #define EPSILON 0.00000000001
+#define M_PI 3.14159265358979323846
 
 #if PS == 6
 const int hp = P/2;
@@ -630,6 +720,7 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #define R_AREA(a) (a * T1 + RF-1)
 
 // research shapes
+// XXX would be nice to have the option of temporally-varying research sizes
 #if R == 0 || R == 1
 #define FOR_RESEARCH(r) S_1X1(r)
 const int r_area = R_AREA(1);
@@ -807,11 +898,13 @@ vec4 patch_comparison(vec3 r, vec3 r2)
 	return min_rot * p_scale;
 }
 
-#define NO_GATHER (PD == 0) // never textureGather if any of these conditions are false
+#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
 #define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
 
 #if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
 // 3x3 diamond/plus patch_comparison_gather
+// XXX extend to support arbitrary sizes (probably requires code generation)
+// XXX extend to support 3x3 square
 const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
 const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
 vec4 poi_patch = gather_offs(0, offsets);
@@ -841,6 +934,7 @@ vec4 patch_comparison_gather(vec3 r, vec3 r2)
 }
 #elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
 // tiled even square patch_comparison_gather
+// XXX extend to support odd square?
 vec4 patch_comparison_gather(vec3 r, vec3 r2)
 {
 	vec2 tile;
@@ -921,6 +1015,7 @@ vec4 hook()
 #endif
 
 	FOR_FRAME(r) {
+	// XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
 #if T && ME == 1 // temporal & motion estimation max weight
 	if (r.z > 0) {
 		me += me_tmp;
@@ -991,6 +1086,7 @@ vec4 hook()
 	} // FOR_RESEARCH
 	} // FOR_FRAME
 
+	// XXX optionally put the denoised pixel into the frame buffer?
 #if T // temporal
 	imageStore(PREV3, ivec2(HOOKED_pos*imageSize(PREV3)), load2(vec3(0,0,2)));
 	imageStore(PREV2, ivec2(HOOKED_pos*imageSize(PREV2)), load2(vec3(0,0,1)));
@@ -1045,12 +1141,29 @@ vec4 hook()
 	result = sum / total_weight;
 #endif
 
+#if ASW == 0 // pre-WD weights
+#define AS_weight old_avg_weight
+#elif ASW == 1 // post-WD weights
+#define AS_weight avg_weight
+#endif
+
+#if ASK == 0
+	vec4 sharpening_strength = pow(AS_weight, vec4(ASP));
+#elif ASK == 1
+#define sigmoid(x) (tanh(x * 2*M_PI - M_PI)*0.5+0.5)
+	vec4 sharpening_strength = mix(pow(sigmoid(AS_weight), vec4(ASP)),
+	                               AS_weight, ASC);
+	// just in case ASC < 0 (will sharpen but it's janky XXX)
+	sharpening_strength = clamp(sharpening_strength, 0.0, 1.0);
+#elif ASK == 2
+	vec4 sharpening_strength = vec4(ASP);
+#endif
+
+	// XXX maybe allow for alternative blurs? e.g., replace result w/ load2?
 #if AS == 1 // sharpen+denoise
 	vec4 sharpened = result + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
 #elif AS == 2 // sharpen only
 	vec4 sharpened = poi + (poi - result) * ASF;
-	vec4 sharpening_power = pow(avg_weight, vec4(ASP));
 #endif
 
 #if EP // extremes preserve
@@ -1061,9 +1174,23 @@ vec4 hook()
 #endif
 
 #if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_power);
+	result = mix(sharpened, result, sharpening_strength);
 #elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_power);
+	result = mix(sharpened, poi, sharpening_strength);
+#endif
+
+#if M == 4 // edge map
+	result = sharpening_strength;
+#endif
+
+#if (M == 2 || M == 4) && defined(CHROMA_raw) // drop chroma for weight maps
+	result = vec4(0.5);
+#endif
+
+#if DV == 1
+	result = clamp(abs(poi - result) * S, 0.0, 1.0);
+#elif DV == 2
+	result = (poi - result) * 0.5 + 0.5;
 #endif
 
 	return mix(poi, result, BF);
diff --git a/portable_config/shaders/unsharpMask_next.glsl b/portable_config/shaders/unsharpMask_next.glsl
new file mode 100644
index 00000000..3b6b9a2b
--- /dev/null
+++ b/portable_config/shaders/unsharpMask_next.glsl
@@ -0,0 +1,74 @@
+//!HOOK MAIN
+//!BIND HOOKED
+//!SAVE PASS0
+//!DESC unsharp mask pass0
+
+vec4 hook() { // pass 0: the returned color is stored as PASS0 (see the //!SAVE directive above)
+    return linearize(textureLod(HOOKED_raw, HOOKED_pos, 0.0) * HOOKED_mul); // LOD-0 sample of the hooked texture scaled by HOOKED_mul; linearize() is not defined in this file (presumably mpv's to-linear-light helper - confirm)
+}
+
+//!HOOK MAIN
+//!BIND PASS0
+//!SAVE PASS1
+//!DESC unsharp mask pass1
+
+////////////////////////////////////////////////////////////////////////
+// USER CONFIGURABLE, PASS 1 (blur in y axis)
+//
+// CAUTION! probably should use the same settings for "USER CONFIGURABLE, PASS 2" below
+//
+#define SIGMA 1.0 //blur spread or amount, (0.0, 10+]
+#define RADIUS 3.0 //kernel radius (integer as float, e.g. 3.0), (0.0, 10+]; probably should set it to ceil(3 * SIGMA)
+//
+////////////////////////////////////////////////////////////////////////
+
+#define get_weight(x) (exp(-x * x / (2.0 * SIGMA * SIGMA))) // unnormalized Gaussian weight at offset x; x is not parenthesized in the expansion, so pass a plain variable
+
+vec4 hook() { // pass 1: weighted average of PASS0 along y over offsets -RADIUS..RADIUS; the result is stored as PASS1
+    float weight; // weight shared by the current -i/+i tap pair
+    vec4 csum = textureLod(PASS0_raw, PASS0_pos, 0.0) * PASS0_mul; // center tap, implicit weight 1.0 (matches wsum's initial value)
+    float wsum = 1.0; // running sum of all tap weights, used to normalize the result
+    for(float i = 1.0; i <= RADIUS; ++i) { // float counter so i can be used directly in vec2 offsets and get_weight
+        weight = get_weight(i);
+        csum += (textureLod(PASS0_raw, PASS0_pos + PASS0_pt * vec2(0.0, -i), 0.0) + textureLod(PASS0_raw, PASS0_pos + PASS0_pt * vec2(0.0, i), 0.0)) * PASS0_mul * weight; // two taps mirrored around the center along y; PASS0_pt scales the offset (presumably one texel per unit - confirm)
+        wsum += 2.0 * weight; // both mirrored taps contribute the same weight
+    }
+    return csum / wsum; // normalize so the weights effectively sum to 1
+}
+
+//!HOOK MAIN
+//!BIND PASS0
+//!BIND PASS1
+//!DESC unsharp mask pass2
+
+////////////////////////////////////////////////////////////////////////
+// USER CONFIGURABLE, PASS 2 (blur in x axis and apply unsharp mask)
+//
+// CAUTION! probably should use the same settings for "USER CONFIGURABLE, PASS 1" above
+//
+#define SIGMA 1.0 //blur spread or amount, (0.0, 10+]
+#define RADIUS 3.0 //kernel radius (integer as float, e.g. 3.0), (0.0, 10+]; probably should set it to ceil(3 * SIGMA)
+//
+//sharpness
+#define AMOUNT 0.5 //amount of sharpening [0.0, 10+]
+#define THRESHOLD 0.0 //sets the minimum contrast for sharpening (e.g. 0.1), [0.0, 1.0]
+//
+////////////////////////////////////////////////////////////////////////
+
+#define get_weight(x) (exp(-x * x / (2.0 * SIGMA * SIGMA))) // unnormalized Gaussian weight at offset x; x is not parenthesized in the expansion, so pass a plain variable
+
+vec4 hook() { // pass 2: blur PASS1 along x, then add AMOUNT * (PASS0 - blurred) back onto PASS0
+    float weight; // weight shared by the current -i/+i tap pair
+    vec4 csum = textureLod(PASS1_raw, PASS1_pos, 0.0) * PASS1_mul; // center tap, implicit weight 1.0 (matches wsum's initial value)
+    float wsum = 1.0; // running sum of all tap weights, used to normalize the blur
+    for(float i = 1.0; i <= RADIUS; ++i) { // float counter so i can be used directly in vec2 offsets and get_weight
+        weight = get_weight(i);
+        csum += (textureLod(PASS1_raw, PASS1_pos + PASS1_pt * vec2(-i, 0.0), 0.0) + textureLod(PASS1_raw, PASS1_pos + PASS1_pt * vec2(i, 0.0), 0.0)) * PASS1_mul * weight; // two taps mirrored around the center along x; PASS1_pt scales the offset (presumably one texel per unit - confirm)
+        wsum += 2.0 * weight; // both mirrored taps contribute the same weight
+    }
+    vec4 original = textureLod(PASS0_raw, PASS0_pos, 0.0) * PASS0_mul; // unblurred PASS0 value at this position
+    vec4 mask = original - csum / wsum; // difference between the unblurred and the blurred value
+    if (abs(mask.r) > THRESHOLD || abs(mask.g) > THRESHOLD || abs(mask.b) > THRESHOLD) // sharpen when any of r/g/b differs by more than THRESHOLD; alpha is not tested
+        return delinearize(original + mask * AMOUNT); // NOTE(review): vec4 math, so mask.a * AMOUNT is added to alpha as well - confirm this is intended
+    return delinearize(original); // below threshold: PASS0 value passed through delinearize() unchanged (presumably the inverse of pass 0's linearize() - confirm)
+}