diff --git a/.ipynb_checkpoints/scrape-checkpoint.ipynb b/.ipynb_checkpoints/scrape-checkpoint.ipynb new file mode 100644 index 0000000..fea8737 --- /dev/null +++ b/.ipynb_checkpoints/scrape-checkpoint.ipynb @@ -0,0 +1,163 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def retrieve_from_web(url, user_agent, fname):\n", + " request = urllib.request.Request(url, headers = {'User-Agent': user_agent})\n", + " response = urllib.request.urlopen(request)\n", + " html = response.read()\n", + " fname = '/home/ashutosh/Desktop/WebCrawler/HTML/' + fname\n", + " fp = open(fname, 'wb')\n", + " fp.write(html)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def read_html():\n", + " fp = open('/home/ashutosh/Desktop/WebCrawler/HTML/medium_html', 'r')\n", + " buff = fp.read()\n", + " return buff" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "#from urllib.request import urlopen\n", + "import urllib.request\n", + "from bs4 import BeautifulSoup\n", + "user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'\n", + "url = 'https://medium.freecodecamp.org/'\n", + "#retrieve_from_web(url, user_agent, 'medium_html')\n", + "buff = read_html()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "
    \n", + "
  1. The Mobile App Launch Checklist — How to Ship Apps Like a Boss
  2. \n", + "
  3. How To Master Async/Await With This Real World Example
  4. \n", + "
  5. Here are some super secret VS Code hacks to boost your productivity
  6. \n", + "
  7. Removing JavaScript’s “this” keyword makes it a better language. Here’s why.
  8. \n", + "
  9. A chaotic mind leads to chaotic code
  10. \n", + "
  11. I know nothing, but it is okay
  12. \n", + "
  13. Which Programming Language Should You Learn Next?
  14. \n", + "
  15. How to create a Discord bot under 15 minutes
  16. \n", + "
  17. How to go from scratch to Create-React-App on Windows
  18. \n", + "
  19. How I built an async form validation library in ~100 lines of code with React Hooks
  20. \n", + "
  21. Introducing ABS, a programming language for shell scripting
  22. \n", + "
  23. How to write a better CV— the Web Developer edition
  24. \n", + "
  25. The React Handbook
  26. \n", + "
  27. Simple site hosting with Amazon S3 and HTTPS
  28. \n", + "
  29. How to Host a Static Website with S3, CloudFront and Route53
  30. \n", + "
  31. How to Publish An Application In The Play Store
  32. \n", + "
  33. The Strategy Pattern explained using Java
  34. \n", + "
  35. How to calculate Binary Tree height with the recursive method
  36. \n", + "
  37. I landed an internship at Facebook. Here are some tips I learned.
  38. \n", + "
  39. Essential Gems for Rails Applications
  40. \n", + "
  41. How to secure and manage secrets using Google Cloud KMS
  42. \n", + "
  43. How to Pass Oracle’s Java Certifications — a Practical Guide for Developers
  44. \n", + "
  45. Master the art of looping in JavaScript with these incredible tricks
  46. \n", + "
  47. The art of asking questions
  48. \n", + "
  49. The Definitive Guide to Contributing to Open Source
  50. \n" + ] + }, + { + "data": { + "text/plain": [ + "5748" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import time\n", + "soup = BeautifulSoup(buff, \"html.parser\")\n", + "#print(soup.prettify())\n", + "all_news = soup.find_all('a')\n", + "#print(all_news[0])\n", + "#print(type(all_news))\n", + "#html_links = \"\"\n", + "html_links =\"
      \"\n", + "for news in all_news:\n", + " head = news.find('h3')\n", + " if head:\n", + " #Visit our HTML tutorial \n", + " lnks = \"
    1. {1}
    2. \".format(news.get('href'), head.text)\n", + " html_links = html_links + \"\\n\" + lnks\n", + " #print((news.get('href')))\n", + " #print(type(head))\n", + " #print(head.attrs)\n", + " #print(head.text)\n", + "print(html_links)\n", + "html_links = html_links + \"
    \"\n", + "fname = '/home/ashutosh/Desktop/WebCrawler/result/'+ str(time.strftime(\"%y-%m%-d\")) + \".html\"\n", + "fp = open(fname, 'w')\n", + "fp.write(html_links)\n", + "#print(type(par))\n", + "#print(par)\n", + "#print((all_news[0].parent.name))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/.ipynb_checkpoints/test-checkpoint.ipynb b/.ipynb_checkpoints/test-checkpoint.ipynb new file mode 100644 index 0000000..2fd6442 --- /dev/null +++ b/.ipynb_checkpoints/test-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/.ipynb_checkpoints/web_log-checkpoint.ipynb b/.ipynb_checkpoints/web_log-checkpoint.ipynb new file mode 100644 index 0000000..8c89322 --- /dev/null +++ b/.ipynb_checkpoints/web_log-checkpoint.ipynb @@ -0,0 +1,62 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "#for handler in logging.root.handlers[:]:\n", + "# logging.root.removeHandler(handler)\n", + "\n", + "logging.basicConfig(filename = \"wb.log\", format = '%(asctime)s-%(levelname)s - %(message)s', level=logging.INFO, filemode = 'w')\n", + "log = logging.getLogger(__name__)\n", + "#log.setLevel(20)\n", + "log.info(\"logging outputr\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/HTML/b.txt b/HTML/b.txt new file mode 100644 index 0000000..12a6eac --- /dev/null +++ b/HTML/b.txt @@ -0,0 +1 @@ +sjsjj \ No newline at end of file diff --git a/HTML/medium_html b/HTML/medium_html new file mode 100644 index 0000000..dd8d5e6 --- /dev/null +++ b/HTML/medium_html @@ -0,0 +1,237 @@ +freeCodeCamp.org
    Join a community of millions of people who are learning new skills together, building their portfolios, and getting developer jobs.
    \ No newline at end of file diff --git a/app.log b/app.log new file mode 100644 index 0000000..97dcf1f --- /dev/null +++ b/app.log @@ -0,0 +1 @@ +root - INFO - s diff --git a/result/19-0116.html b/result/19-0116.html new file mode 100644 index 0000000..7c70308 --- /dev/null +++ b/result/19-0116.html @@ -0,0 +1,26 @@ +
      +
    1. The Mobile App Launch Checklist — How to Ship Apps Like a Boss
    2. +
    3. How To Master Async/Await With This Real World Example
    4. +
    5. Here are some super secret VS Code hacks to boost your productivity
    6. +
    7. Removing JavaScript’s “this” keyword makes it a better language. Here’s why.
    8. +
    9. A chaotic mind leads to chaotic code
    10. +
    11. I know nothing, but it is okay
    12. +
    13. Which Programming Language Should You Learn Next?
    14. +
    15. How to create a Discord bot under 15 minutes
    16. +
    17. How to go from scratch to Create-React-App on Windows
    18. +
    19. How I built an async form validation library in ~100 lines of code with React Hooks
    20. +
    21. Introducing ABS, a programming language for shell scripting
    22. +
    23. How to write a better CV— the Web Developer edition
    24. +
    25. The React Handbook
    26. +
    27. Simple site hosting with Amazon S3 and HTTPS
    28. +
    29. How to Host a Static Website with S3, CloudFront and Route53
    30. +
    31. How to Publish An Application In The Play Store
    32. +
    33. The Strategy Pattern explained using Java
    34. +
    35. How to calculate Binary Tree height with the recursive method
    36. +
    37. I landed an internship at Facebook. Here are some tips I learned.
    38. +
    39. Essential Gems for Rails Applications
    40. +
    41. How to secure and manage secrets using Google Cloud KMS
    42. +
    43. How to Pass Oracle’s Java Certifications — a Practical Guide for Developers
    44. +
    45. Master the art of looping in JavaScript with these incredible tricks
    46. +
    47. The art of asking questions
    48. +
    49. The Definitive Guide to Contributing to Open Source
    diff --git a/scrape.ipynb b/scrape.ipynb new file mode 100644 index 0000000..1e622b4 --- /dev/null +++ b/scrape.ipynb @@ -0,0 +1,163 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def retrieve_from_web(url, user_agent, fname):\n", + " request = urllib.request.Request(url, headers = {'User-Agent': user_agent})\n", + " response = urllib.request.urlopen(request)\n", + " html = response.read()\n", + " fname = '/home/ashutosh/Desktop/WebCrawler/HTML/' + fname\n", + " fp = open(fname, 'wb')\n", + " fp.write(html)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def read_html():\n", + " fp = open('/home/ashutosh/Desktop/WebCrawler/HTML/medium_html', 'r')\n", + " buff = fp.read()\n", + " return buff" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "#from urllib.request import urlopen\n", + "import urllib.request\n", + "from bs4 import BeautifulSoup\n", + "user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'\n", + "url = 'https://medium.freecodecamp.org/'\n", + "#retrieve_from_web(url, user_agent, 'medium_html')\n", + "buff = read_html()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "
      \n", + "
    1. The Mobile App Launch Checklist — How to Ship Apps Like a Boss
    2. \n", + "
    3. How To Master Async/Await With This Real World Example
    4. \n", + "
    5. Here are some super secret VS Code hacks to boost your productivity
    6. \n", + "
    7. Removing JavaScript’s “this” keyword makes it a better language. Here’s why.
    8. \n", + "
    9. A chaotic mind leads to chaotic code
    10. \n", + "
    11. I know nothing, but it is okay
    12. \n", + "
    13. Which Programming Language Should You Learn Next?
    14. \n", + "
    15. How to create a Discord bot under 15 minutes
    16. \n", + "
    17. How to go from scratch to Create-React-App on Windows
    18. \n", + "
    19. How I built an async form validation library in ~100 lines of code with React Hooks
    20. \n", + "
    21. Introducing ABS, a programming language for shell scripting
    22. \n", + "
    23. How to write a better CV— the Web Developer edition
    24. \n", + "
    25. The React Handbook
    26. \n", + "
    27. Simple site hosting with Amazon S3 and HTTPS
    28. \n", + "
    29. How to Host a Static Website with S3, CloudFront and Route53
    30. \n", + "
    31. How to Publish An Application In The Play Store
    32. \n", + "
    33. The Strategy Pattern explained using Java
    34. \n", + "
    35. How to calculate Binary Tree height with the recursive method
    36. \n", + "
    37. I landed an internship at Facebook. Here are some tips I learned.
    38. \n", + "
    39. Essential Gems for Rails Applications
    40. \n", + "
    41. How to secure and manage secrets using Google Cloud KMS
    42. \n", + "
    43. How to Pass Oracle’s Java Certifications — a Practical Guide for Developers
    44. \n", + "
    45. Master the art of looping in JavaScript with these incredible tricks
    46. \n", + "
    47. The art of asking questions
    48. \n", + "
    49. The Definitive Guide to Contributing to Open Source
    50. \n" + ] + }, + { + "data": { + "text/plain": [ + "5748" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import time\n", + "soup = BeautifulSoup(buff, \"html.parser\")\n", + "#print(soup.prettify())\n", + "all_news = soup.find_all('a')\n", + "#print(all_news[0])\n", + "#print(type(all_news))\n", + "#html_links = \"\"\n", + "html_links =\"
        \"\n", + "for news in all_news:\n", + " head = news.find('h3')\n", + " if head:\n", + " #Visit our HTML tutorial \n", + " lnks = \"
      1. {1}
      2. \".format(news.get('href'), head.text)\n", + " html_links = html_links + \"\\n\" + lnks\n", + " #print((news.get('href')))\n", + " #print(type(head))\n", + " #print(head.attrs)\n", + " #print(head.text)\n", + "print(html_links)\n", + "html_links = html_links + \"
      \"\n", + "fname = '/home/ashutosh/Desktop/WebCrawler/result/'+ str(time.strftime(\"%y-%m%-d\")) + \".html\"\n", + "fp = open(fname, 'w')\n", + "fp.write(html_links)\n", + "#print(type(par))\n", + "#print(par)\n", + "#print((all_news[0].parent.name))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 0000000..84f6b26 --- /dev/null +++ b/test.ipynb @@ -0,0 +1,292 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "freeCodeCamp.org

      freeCodeCamp.org

      Stories worth reading about programming and technology from our open source community.

      Join a community of millions of people who are learning new skills together, building their portfolios, and getting developer jobs.
      \n" + ] + } + ], + "source": [ + "\n", + "a = '/home/ashutosh/Desktop/WebCrawler/HTML/' + 'b.txt'\n", + "fp = open(a, 'w')\n", + "fp.write(\"sjsjj\")\n", + "\n", + "fp = open('/home/ashutosh/Desktop/WebCrawler/HTML/medium_html', 'r')\n", + "buff = fp.read()\n", + "print(buff)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/wb.log b/wb.log new file mode 100644 index 0000000..607816c --- /dev/null +++ b/wb.log @@ -0,0 +1 @@ +2019-01-05 15:36:42,211 - INFO logging outputr diff --git a/web_log.ipynb b/web_log.ipynb new file mode 100644 index 0000000..ca7dda2 --- /dev/null +++ b/web_log.ipynb @@ -0,0 +1,73 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "#for handler in logging.root.handlers[:]:\n", + "# logging.root.removeHandler(handler)\n", + "\n", + "logging.basicConfig(filename = \"wb.log\", format = '%(asctime)s-%(levelname)s - %(message)s', level=logging.INFO, filemode = 'w')\n", + "log = logging.getLogger(__name__)\n", + "#log.setLevel(20)\n", + "log.info(\"logging outputr\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "19-01-16\n" + ] + } + ], + "source": [ + "import time\n", + "print(time.strftime(\"%y-%m-%d\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}