Skip to content
This repository was archived by the owner on Jan 15, 2022. It is now read-only.

Update filter with top 500 sites from Alexa? #20

Closed
pdehaan opened this issue Sep 8, 2016 · 2 comments
Closed

Update filter with top 500 sites from Alexa? #20

pdehaan opened this issue Sep 8, 2016 · 2 comments

Comments

@pdehaan
Copy link

pdehaan commented Sep 8, 2016

I ran a quick scan of the sites in your NSFW filter against the Alexa Top 500 adult sites (http://www.alexa.com/topsites/category/Top/Adult) and got about a 33% success rate. I'm not sure how recent our filter file is, whether it is ever updated, or whether we want to add new links.

Steps to reproduce:

const urlParse = require('url').parse;

const fetch = require('node-fetch');
const alexa = require('alexa-top-sites');

// Start both lookups immediately; Promise.all below waits for the pair.
const promises = [
  getNSFWFilter(),
  getAlexaByCategory('Adult')
];

// Report every Alexa host that is missing from the NSFW filter list.
Promise.all(promises)
  .then(([nsfwFilter, alexaSites]) => {
    // A Set gives constant-time membership checks instead of rescanning the
    // whole filter array once per Alexa site.
    const knownHosts = new Set(nsfwFilter);
    const results = alexaSites.filter((site) => !knownHosts.has(site));
    // Guard the empty case: 0 / 0 would otherwise be printed as "NaN%".
    const pct = alexaSites.length === 0
      ? '0'
      : (results.length / alexaSites.length * 100).toFixed(0);
    const summary = `${results.length} of ${alexaSites.length} (${pct}%) results not found in master list.`;
    return {summary, results};
  })
  .then((results) => console.log(JSON.stringify(results, null, 2)))
  .catch((err) => console.error(err));


/**
 * Downloads the heatmap NSFW filter list and returns its entries.
 *
 * Each line is trimmed; blank lines and lines starting with `#` are dropped.
 *
 * @returns {Promise<string[]>} Resolves with the filter entries. Rejects if
 *   the request fails or the server answers with a non-2xx status.
 */
function getNSFWFilter() {
  return fetch('https://raw.githubusercontent.com/mozilla/heatmap/master/pornfilter/nofap.txt')
    .then((res) => {
      // Without this check an HTTP error body would be parsed as list entries.
      if (!res.ok) {
        throw new Error(`Failed to fetch NSFW filter: HTTP ${res.status}`);
      }
      return res.text();
    })
    .then((list) => list
      .split('\n')
      // Trimming removes a trailing "\r" (CRLF files) that would otherwise
      // stop an entry from ever matching a host name.
      .map((item) => item.trim())
      .filter((item) => item !== '' && !item.startsWith('#')));
}

/**
 * Looks up the Alexa listing for a category and reduces each entry to the
 * `host` component produced by `urlParse`.
 *
 * @param {string} category - Category name handed to `alexa.byCategory`.
 * @returns {Promise<Array>} Resolves with one `host` value per listed site.
 */
function getAlexaByCategory(category) {
  // Entries are presumably absolute URLs; url.parse yields a null host otherwise.
  const toHost = (siteUrl) => urlParse(siteUrl).host;

  return alexa.byCategory(category).then(({sites}) => sites.map(toHost));
}

OUTPUT:

{
  "summary": "17 of 25 (68%) results not found in master list.",
  "results": [
    "livejasmin.com",
    "g.e-hentai.org",
    "nudevista.com",
    "fetlife.com",
    "nhentai.net",
    "literotica.com",
    "furaffinity.net",
    "freeones.com",
    "adam4adam.com",
    "newgrounds.com",
    "clips4sale.com",
    "ebaumsworld.com",
    "manhunt.net",
    "luscious.net",
    "mrskin.com",
    "hentai-foundry.com",
    "digitalplayground.com"
  ]
}

NOTE: My lame alexa-top-sites module only scrapes the front page of the http://www.alexa.com/topsites/category/Top/ pages, so it only checks the first 25 results and not all 20 pages (500 results).

@pdehaan
Copy link
Author

pdehaan commented Sep 10, 2016

I updated my alexa-top-sites scraper to support pagination so I could grab all 500 results, and Alexa seems to have a large number of results not found in the heatmap list.

"summary": "464 of 500 (93%) results not found in master list.",

Let me know if you want me to try to submit a PR to add these to nofap, or if you'd rather I just submit a separate file.


New version of Alexa scraper (with support for crude local caching) is:

const fs = require('fs');
const urlParse = require('url').parse;

const fetch = require('node-fetch');
const alexa = require('alexa-top-sites');

// Start both lookups immediately; Promise.all below waits for the pair.
// Either entry may be a plain array (cache hit) or a promise (cache miss);
// Promise.all accepts both.
const promises = [
  getNSFWFilter(),
  getAlexaByCategory('Adult')
];

// Report every Alexa host that is missing from the NSFW filter list.
Promise.all(promises)
  .then(([nsfwFilter, alexaSites]) => {
    // A Set gives constant-time membership checks instead of rescanning the
    // whole filter array once per Alexa site.
    const knownHosts = new Set(nsfwFilter);
    const results = alexaSites.filter((site) => !knownHosts.has(site));
    // Guard the empty case: 0 / 0 would otherwise be printed as "NaN%".
    const pct = alexaSites.length === 0
      ? '0'
      : (results.length / alexaSites.length * 100).toFixed(0);
    const summary = `${results.length} of ${alexaSites.length} (${pct}%) results not found in master list.`;
    return {summary, results};
  })
  .then((results) => console.log(JSON.stringify(results, null, 2)))
  .catch((err) => console.error(err));


/**
 * Returns the heatmap NSFW filter list, preferring a local JSON cache.
 *
 * On a cache hit the parsed cache contents are returned synchronously.
 * On a miss the list is downloaded, trimmed, stripped of blank lines and
 * `#` comment lines, written to the cache file, and returned via a promise.
 * Callers should therefore accept "value or promise" (e.g. via Promise.all).
 *
 * @returns {string[]|Promise<string[]>} The filter entries. The promise
 *   rejects if the request fails or the server answers with a non-2xx status.
 */
function getNSFWFilter() {
  const cacheName = './nsfwfilter.cache.json';

  // Read the cache through fs rather than require(): require() resolves a
  // relative path against this module's directory, while fs.writeFileSync()
  // below resolves it against process.cwd(), so the two could disagree.
  try {
    return JSON.parse(fs.readFileSync(cacheName, 'utf8'));
  } catch (err) {
    if (err.code === 'ENOENT') {
      // Expected on the first run; no stack trace needed.
      console.error('cache not found: %s', cacheName);
    } else {
      console.error('cache unreadable: %s', cacheName);
      console.error(err);
    }
  }

  return fetch('https://raw.githubusercontent.com/mozilla/heatmap/master/pornfilter/nofap.txt')
    .then((res) => {
      // Without this check an HTTP error body would be parsed and cached as list entries.
      if (!res.ok) {
        throw new Error(`Failed to fetch NSFW filter: HTTP ${res.status}`);
      }
      return res.text();
    })
    .then((list) => list
      .split('\n')
      // Trimming removes a trailing "\r" (CRLF files) that would otherwise
      // stop an entry from ever matching a host name.
      .map((item) => item.trim())
      .filter((item) => item !== '' && !item.startsWith('#')))
    .then((list) => {
      fs.writeFileSync(cacheName, JSON.stringify(list, null, 2));
      return list;
    });
}

/**
 * Returns the hosts listed by Alexa for a category, preferring a local JSON
 * cache named after the category.
 *
 * On a cache hit the parsed cache contents are returned synchronously.
 * On a miss, `alexa.getPages` is asked for 20 pages of `alexa.byCategory`
 * results, each entry is reduced to its `host` via `urlParse`, and the list
 * is written to the cache file and returned via a promise.
 * Callers should therefore accept "value or promise" (e.g. via Promise.all).
 *
 * @param {string} category - Category name; also used in the cache file name.
 * @returns {Array|Promise<Array>} One `host` value per listed site.
 */
function getAlexaByCategory(category) {
  const cacheName = `./alexa-${category}.cache.json`;

  // Read the cache through fs rather than require(): require() resolves a
  // relative path against this module's directory, while fs.writeFileSync()
  // below resolves it against process.cwd(), so the two could disagree.
  try {
    return JSON.parse(fs.readFileSync(cacheName, 'utf8'));
  } catch (err) {
    if (err.code === 'ENOENT') {
      // Expected on the first run; no stack trace needed.
      console.error('cache not found: %s', cacheName);
    } else {
      console.error('cache unreadable: %s', cacheName);
      console.error(err);
    }
  }

  return alexa.getPages(alexa.byCategory, category, 20)
    .then((list) => list.map((site) => urlParse(site).host))
    .then((list) => {
      fs.writeFileSync(cacheName, JSON.stringify(list, null, 2));
      return list;
    });
}

@crankycoder
Copy link

@pdehaan I've added an issue over at crankycoder/nopornjs#1, which is where the bloomfilter is generated.

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants