Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[http] fix parsing link header #1924

Merged
merged 2 commits into from
Jun 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 41 additions & 4 deletions visidata/loaders/http.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from visidata import Path, RepeatFile, vd, VisiData
from visidata.loaders.tsv import splitter

Expand Down Expand Up @@ -47,24 +49,59 @@ def _iter_lines(path=path, response=response, max_next=vd.options.http_max_next)
linkhdr = response.getheader('Link')
src = None
if linkhdr:
links = urllib.parse.parse_header(linkhdr)
src = links.get('next', {}).get('url', None)
links = parse_header_links(linkhdr)
link_data = {}
for link in links:
key = link.get('rel') or link.get('url')
link_data[key] = link
src = link_data.get('next', {}).get('url', None)

if not src:
break

n += 1
if n > max_next:
vd.warning(f'stopping at max {max_next} pages')
vd.warning(f'stopping at max next pages: {max_next} pages')
break

vd.status(f'fetching next page from {src}')
response = requests.get(src, stream=True, **vd.options.getall('http_req_'))
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, this is weird, were we still trying to use the requests library without importing it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, there had previously been two lines with requests.get(), but only the first one had been changed to use urllib.request.urlopen().

req = urllib.request.Request(src, **vd.options.getall('http_req_'))
response = urllib.request.urlopen(req)

# add resettable iterator over contents as an already-open fp
path.fptext = RepeatFile(_iter_lines())

return vd.openSource(path, filetype=filetype)

def parse_header_links(link_header):
    '''Return a list of dictionaries:
    [{'url': 'https://example.com/content?page=1', 'rel': 'prev'},
     {'url': 'https://example.com/content?page=3', 'rel': 'next'}]
    Takes a link header string, of the form
    '<https://example.com/content?page=1>; rel="prev", <https://example.com/content?page=3>; rel="next"'
    Returns [] for an empty/whitespace-only header.
    See https://datatracker.ietf.org/doc/html/rfc8288#section-3
    '''

    links = []
    quote_space = ' \'"'
    link_header = link_header.strip(quote_space)
    if not link_header: return []
    # Split on the comma that separates link-values; the '<' consumed by the
    # pattern is restored implicitly because url.strip() removes '<>' anyway.
    for link_value in re.split(', *<', link_header):
        if ';' in link_value:
            url, params = link_value.split(';', maxsplit=1)
        else:
            url, params = link_value, ''
        link = {'url': url.strip('<>' + quote_space)}

        for param in params.split(';'):
            if '=' in param:
                # maxsplit=1: param values may themselves contain '=' (e.g.
                # title="a=b" or a URL-valued param); splitting on every '='
                # would raise ValueError on unpacking.
                key, value = param.split('=', maxsplit=1)
                key = key.strip(quote_space)
                value = value.strip(quote_space)
                link[key] = value
            else:
                # a parameter without '=' ends parsing of this link-value
                break
        links.append(link)
    return links

VisiData.openurl_https = VisiData.openurl_http
2 changes: 1 addition & 1 deletion visidata/plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import shutil
import importlib
import subprocess
import urllib
import urllib.error

from visidata import VisiData, vd, Path, CellColorizer, JsonLinesSheet, AttrDict, Column, Progress, ExpectedException, BaseSheet, asyncsingle, asyncthread

Expand Down