Skip to content

Commit

Permalink
Fix issue #519 Error: Image size couldn't be retrieved
Browse files (browse the repository at this point in the history)
  • Loading branch information
thomas694 committed Apr 14, 2024
1 parent 6db5ab5 commit 85637ec
Showing 1 changed file with 5 additions and 0 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -29,6 +29,7 @@ namespace TumblThree.Applications.Crawler
public abstract class AbstractTumblrCrawler : AbstractCrawler
{
// Matches a "window['___INITIAL_STATE___'] = {...};" assignment in the page source.
// Group 1 captures the braces-delimited object literal. No RegexOptions.Singleline is set,
// so '.' does not cross line breaks and the whole assignment must sit on one line.
private static readonly Regex extractJsonFromPage = new Regex("window\\['___INITIAL_STATE___'] = ({.*});");
// Alternative form: the object is the content of an element whose opening tag ends with
// id="___INITIAL_STATE___"> and which is closed by </script>. Group 1 captures the
// braces-delimited object; Singleline lets '.' match newlines, so it may span several lines.
// RetrieveOriginalImageUrl falls back to this pattern when the first one yields nothing.
private static readonly Regex extractJsonFromPage2 = new Regex("id=\"___INITIAL_STATE___\">\\s*?({.*})\\s*?</script>", RegexOptions.Singleline);
// Matches a self-closing <img class="..." src="..." alt="..."/> tag with the attributes in
// exactly this order; group 1 captures the src attribute value.
private static readonly Regex extractImageLink = new Regex("<img class=\"\\w+?\" src=\"([^\"]+?)\" alt=\"[^\"]+?\"/>");
// Matches a "/s<digits>x<digits>.../" path segment; groups 1 and 2 capture the two numbers
// (presumably width and height, in that order - confirm against the callers).
private static readonly Regex extractImageSize = new Regex("/s(\\d+?)x(\\d+?)[^/]*?/");

Expand Down Expand Up @@ -475,6 +476,10 @@ protected string RetrieveOriginalImageUrl(string url, int width, int height, boo
{
var extracted = extractJsonFromPage.Match(pageContent).Groups[1].Value;
extracted = new Regex("/.*/").Replace(extracted, "\"\"");
if (string.IsNullOrEmpty(extracted))
{
extracted = extractJsonFromPage2.Match(pageContent).Groups[1].Value;
}
ImageResponse imgRsp = DeserializeImageResponse(extracted);
int maxWidth = imgRsp.Images.Max(x => x.Width);
Image img = imgRsp.Images.FirstOrDefault(x => x.Width == maxWidth);
Expand Down

0 comments on commit 85637ec

Please sign in to comment.