From b50db2e7133b554d594097d8fe81caaec2b9562f Mon Sep 17 00:00:00 2001
From: Juan Carlos Niebles
Date: Sun, 26 Jan 2025 18:35:13 -0800
Subject: [PATCH] fixed indexing of external posts (#2983)

This should fix several issues with indexing external posts, including
#1828. In short, I found that the issue with indexing was that the index
builder was receiving 'empty' documents. To fix that, I'm setting the
document content to be the post content as retrieved from the rss feed
or the text extracted from the external page. I've tested with various
blog sources and it seems to be working as expected now.
---
 _plugins/external-posts.rb | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/_plugins/external-posts.rb b/_plugins/external-posts.rb
index 41a6c4360657..1f66a8c853f8 100644
--- a/_plugins/external-posts.rb
+++ b/_plugins/external-posts.rb
@@ -62,6 +62,7 @@ def create_document(site, source_name, url, content)
       doc.data['description'] = content[:summary]
       doc.data['date'] = content[:published]
       doc.data['redirect'] = url
+      doc.content = content[:content]
       site.collections['posts'].docs << doc
     end
 
@@ -90,8 +91,12 @@ def fetch_content_from_url(url)
       parsed_html = Nokogiri::HTML(html)
 
       title = parsed_html.at('head title')&.text.strip || ''
-      description = parsed_html.at('head meta[name="description"]')&.attr('content') || ''
-      body_content = parsed_html.at('body')&.inner_html || ''
+      description = parsed_html.at('head meta[name="description"]')&.attr('content')
+      description ||= parsed_html.at('head meta[name="og:description"]')&.attr('content')
+      description ||= parsed_html.at('head meta[property="og:description"]')&.attr('content')
+
+      body_content = parsed_html.search('p').map { |e| e.text }
+      body_content = body_content.join() || ''
 
       {
         title: title,