
Need Help Implementing An If/Else Or Case Statement Into Python Script

Asked by Sam Lee on Stack Overflow, December 22, 2021

The code crawls a website, prints out the internal and external links, and stores them in txt, json, xml, and csv files.
I need help implementing an if/else or case statement so that I can choose which file format the results (internal and external links) are saved in.
I would also like help making the code more optimized, or otherwise better than it currently is, if possible.
How to execute the program: python filename url
My Output:
Total Internal Links: ….
Total External Links: ….
Total Links: …
Then it exports those links to txt, json, csv, and xml files.

import requests
import argparse
import time
import json
import random
import pandas as pd
import os
import xml.etree.ElementTree as xml
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

internal_links = set()
external_links = set()
urls = []
total_links_visited = 0

#check if url is valid
def is_valid(url):
   parsed = urlparse(url)
   return bool(parsed.netloc) and bool(parsed.scheme)

#this function finds and prints out the internal and external links
def get_all_website_links(url):
   global urls
   domain_name = urlparse(url).netloc
   res1 = requests.get(url)
   soup = BeautifulSoup(res1.content, "html.parser")
   for a_tag in soup.findAll("a"):
      href_tag = a_tag.attrs.get("href")
      if href_tag:
         href_tag = urljoin(url, href_tag)
         parsed_href = urlparse(href_tag)
         href_tag = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
         if is_valid(href_tag):
            if domain_name not in urlparse(href_tag).netloc and href_tag not in external_links:
               print(f"External link: {href_tag}")
               external_links.add(href_tag)
               continue
            elif href_tag not in urls:
               print(f"Internal link: {href_tag}")
               urls.append(href_tag)
               internal_links.add(href_tag)

#this function crawls a web page and extracts all links
def crawl(url, max_urls=50):
   global total_links_visited, urls
   total_links_visited += 1
   get_all_website_links(url)
   for link in urls:
      if total_links_visited > max_urls:
         break
      crawl(link, max_urls=max_urls)

#main function
def main():
   parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
   parser.add_argument("url", help="The URL to extract links from.")
   parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)
   args = parser.parse_args()
   url = args.url
   max_urls = args.max_urls
   domain_name = urlparse(url).netloc
   res = requests.get(url)
   statuscode = res.status_code
   print("Status Code:", statuscode)
   if statuscode == 200: 
      crawl(url, max_urls=max_urls)
   else:
      print("Failed to get a request response back.")

   print("Total Internal Links:", len(internal_links))
   print("Total External Links:", len(external_links))
   print("Total Links:", len(external_links) + len(internal_links))
    
   with open(f"{domain_name}_internal_links.txt", "w") as f:
      for internal_link in internal_links:
         print(internal_link.strip(), file=f)
   with open(f"{domain_name}_external_links.txt", "w") as f:
      for external_link in external_links:
         print(external_link.strip(), file=f)

   #writing to json files
   f = open(f"{domain_name}_internal_links.json","w")
   json.dump({'internal_links':list(internal_links)}, f, indent=6)
   f.close()
   f = open(f"{domain_name}_external_links.json","w")
   json.dump({'external_links':list(external_links)}, f, indent=6)
   f.close()

   #writing to csv
   df = pd.DataFrame(list(internal_links))
   df.to_csv(f"{domain_name}_internal_links.csv", index=False, header=False)
   df = pd.DataFrame(list(external_links))
   df.to_csv(f"{domain_name}_external_links.csv", index=False, header=False)

   #writing to xml (a new <link> sub-element is created for every URL)
   root = xml.Element("internal_links")
   for link in internal_links:
      xml.SubElement(root, "link").text = link
   tree = xml.ElementTree(root)
   tree.write(f"{domain_name}_internal_links.xml")

   root = xml.Element("external_links")
   for link in external_links:
      xml.SubElement(root, "link").text = link
   tree = xml.ElementTree(root)
   tree.write(f"{domain_name}_external_links.xml")

#executing the python script
if __name__ == "__main__":
   main()

One Answer

You can add one more command-line argument called output-file-format.
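As a possible refinement (not used verbatim in the code below), argparse can also validate the value with its choices parameter, so an unknown format is rejected with a clear error instead of silently falling back to txt:

parser.add_argument("-t", "--output-file-format",
                    choices=["txt", "json", "csv", "xml"], default="txt",
                    help="Output file format to store the data. Default is txt.")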

import requests
import argparse
import time
import json
import random
import pandas as pd
import os
import xml.etree.ElementTree as xml
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

internal_links = set()
external_links = set()
urls = []
total_links_visited = 0

#check if url is valid
def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

#this function finds and prints out the internal and external links
def get_all_website_links(url):
    global urls
    domain_name = urlparse(url).netloc
    res1 = requests.get(url)
    soup = BeautifulSoup(res1.content, "html.parser")
    for a_tag in soup.findAll("a"):
        href_tag = a_tag.attrs.get("href")
        if href_tag:
            href_tag = urljoin(url, href_tag)
            parsed_href = urlparse(href_tag)
            href_tag = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if is_valid(href_tag):
                if domain_name not in urlparse(href_tag).netloc and href_tag not in external_links:
                    print(f"External link: {href_tag}")
                    external_links.add(href_tag)
                    continue
                elif href_tag not in urls:
                    print(f"Internal link: {href_tag}")
                    urls.append(href_tag)
                    internal_links.add(href_tag)

#this function crawls a web page and extracts all links
def crawl(url, max_urls=50):
    global total_links_visited, urls
    total_links_visited += 1
    get_all_website_links(url)
    for link in urls:
        if total_links_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)

def save(output_file_format, domain_name, internal_links, external_links):
    if output_file_format == "json":
        #writing to json files
        f = open(f"{domain_name}_internal_links.json","w")
        json.dump({'internal_links':list(internal_links)}, f, indent=6)
        f.close()
        f = open(f"{domain_name}_external_links.json","w")
        json.dump({'external_links':list(external_links)}, f, indent=6)
        f.close()

    elif output_file_format == "csv":
        #writing to csv
        df = pd.DataFrame(list(internal_links))
        df.to_csv(f"{domain_name}_internal_links.csv", index=False, header=False)
        df = pd.DataFrame(list(external_links))
        df.to_csv(f"{domain_name}_external_links.csv", index=False, header=False)

    elif output_file_format == "xml":
        #writing to xml (a new <link> sub-element is created for every URL)
        root = xml.Element("internal_links")
        for link in internal_links:
            xml.SubElement(root, "link").text = link
        tree = xml.ElementTree(root)
        tree.write(f"{domain_name}_internal_links.xml")

        root = xml.Element("external_links")
        for link in external_links:
            xml.SubElement(root, "link").text = link
        tree = xml.ElementTree(root)
        tree.write(f"{domain_name}_external_links.xml")
    else:
        with open(f"{domain_name}_internal_links.txt", "w") as f:
            for internal_link in internal_links:
                print(internal_link.strip(), file=f)
        with open(f"{domain_name}_external_links.txt", "w") as f:
            for external_link in external_links:
                print(external_link.strip(), file=f)

#main function
def main():
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)
    parser.add_argument("-t", "--output-file-format", help="Output file format to store the data. Default text", default="txt")
    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls
    output_file_format = args.output_file_format
    domain_name = urlparse(url).netloc
    res = requests.get(url)
    statuscode = res.status_code
    print("Status Code:", statuscode)
    if statuscode == 200: 
        crawl(url, max_urls=max_urls)
    else:
        print("Failed to get a request response back.")

    print("Total Internal Links:", len(internal_links))
    print("Total External Links:", len(external_links))
    print("Total Links:", len(external_links) + len(internal_links))

    save(output_file_format, domain_name, internal_links, external_links)
   

#executing the python script
if __name__ == "__main__":
    main()

Usage:

usage: a.py [-h] [-m MAX_URLS] [-t OUTPUT_FILE_FORMAT] url
a.py: error: the following arguments are required: url

Running the file:

python pyfile3.py -m 1 -t csv https://www.youtube.com

Output:

Status Code: 200
Internal link: https://www.youtube.com/about/
Internal link: https://www.youtube.com/about/press/
Internal link: https://www.youtube.com/about/copyright/
Internal link: https://www.youtube.com/t/contact_us
Internal link: https://www.youtube.com/creators/
Internal link: https://www.youtube.com/ads/
External link: https://developers.google.com/youtube
Internal link: https://www.youtube.com/t/terms
External link: https://www.google.co.jp/intl/ja/policies/privacy/
Internal link: https://www.youtube.com/about/policies/
Internal link: https://www.youtube.com/howyoutubeworks
Internal link: https://www.youtube.com/new
Internal link: https://www.youtube.com/about/experiences/
Internal link: https://www.youtube.com/about/brand-resources/
External link: https://youtube.googleblog.com/
Internal link: https://www.youtube.com/trends/
External link: https://twitter.com/YouTube
External link: https://www.instagram.com/youtube/
External link: https://www.facebook.com/youtube/
Internal link: https://youtube.googleblog.com/
Internal link: https://www.youtube.com/jobs/
Internal link: https://www.youtube.com/howyoutubeworks/
External link: https://www.youtubego.com/
Internal link: https://www.youtube.com/kids/
Internal link: https://www.youtube.com/musicpremium
Internal link: https://www.youtube.com/channel/UCqVDpXKLmKeBU_yyt_QkItQ
Internal link: https://www.youtube.com/premium/
External link: https://studio.youtube.com/
External link: https://tv.youtube.com/
Internal link: https://www.youtube.com/yt/dev/
External link: https://artists.youtube.com/
External link: https://creatoracademy.youtube.com/page/education
Internal link: https://www.youtube.com/yt/family/
External link: https://youtube.com/creatorresearch/
External link: https://servicesdirectory.withyoutube.com/
Internal link: https://www.youtube.com/nextup
Internal link: https://www.youtube.com/space/
External link: https://vr.youtube.com/
Internal link: https://www.youtube.com/creators-for-change/
External link: https://youtube.com/csai-match/
External link: https://socialimpact.youtube.com/
Internal link: https://www.youtubego.com/
Internal link: https://studio.youtube.com/
Internal link: https://tv.youtube.com/
Internal link: https://artists.youtube.com/
Internal link: https://creatoracademy.youtube.com/page/education
Internal link: https://youtube.com/creatorresearch/
Internal link: https://servicesdirectory.withyoutube.com/
Internal link: https://vr.youtube.com/
Internal link: https://youtube.com/csai-match/
Internal link: https://socialimpact.youtube.com/
Internal link: https://www.youtube.com
External link: https://www.google.com/policies/privacy/
External link: https://support.google.com/youtube/
Total Internal Links: 36
Total External Links: 18
Total Links: 54
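
Side note: the question also mentions a case statement. Python has no switch/case construct before 3.10, but on Python 3.10+ the if/elif chain in save() could be written as a match statement instead. A minimal sketch of just the dispatch (each arm would keep the same body as the corresponding branch above):

def save(output_file_format, domain_name, internal_links, external_links):
    match output_file_format:
        case "json":
            ...  # same body as the json branch above
        case "csv":
            ...  # same body as the csv branch above
        case "xml":
            ...  # same body as the xml branch above
        case _:
            ...  # default: same body as the txt branch above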

Answered by bigbounty on December 22, 2021
