Google Analytics Webcrawler for every country code

Problem

At the moment I’d like recommendations on what I can change and what I can use to improve simplicity and modularity, and whether I’m practicing good naming conventions so as to provide clean, readable code. Any critique is much appreciated.

import time
import re
import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import time

SESSION = requests.Session()


""" This is the Google Analytics Selector Section """


class myGoogleSession:

    def fetch_google_xml(self, URL, country_code):
        format_url = f"{URL}{country_code}"
        response = SESSION.get(format_url)
        soup = BeautifulSoup(response.text, 'xml',
                             parse_only=SoupStrainer('channel'))
        return soup


google_session = myGoogleSession()


def google_trends_retriever(URL, country_code):
    xml_soup = google_session.fetch_google_xml(URL, country_code)
    print(country_code)
    return[(title.text, re.sub("[+,]", "", traffic.text))
           for title, traffic in zip(xml_soup.find_all('title')[1:],
                                     xml_soup.find_all('ht:approx_traffic'))]


def create_pdTrend(data):
    check_panda = pd.DataFrame(
        google_trends_retriever(GoogleURL, data),
        columns=['Title', 'Score']
    )
    if len(check_panda) == 0:
        print('No available data')
    else:
        return check_panda


""" This is the Country Code Selector Section """


country_code_list = []


class myCountryCodeSession:
    def fetch_countrycode_html(self, URL):
        response = SESSION.get(URL)
        soup = BeautifulSoup(response.text, 'html.parser',
                             parse_only=SoupStrainer('table'))
        return soup


countryCode_session = myCountryCodeSession()


def parse_row(url):
    rows = countryCode_session.fetch_countrycode_html(url)
    _rows = rows.findChildren(['td', 'tr'])
    for row in _rows:
        cells = row.findChildren('td')[2:3]
        for cell in cells:
            value = cell.string
            country_code_list.append(value[:2])
    return None


def create_pdCountryCode(country_code):
    return pd.DataFrame({'Country_Code': country_code})


def iterate_List(data):
    i = 1
    while i <= 239:
        selected_CountryCode = get_data_fromList(i)
        print(create_pdTrend(selected_CountryCode))
        i += 1
    else:
        print('Has reach the end of i ' + str(i))


def get_data_fromList(num):
    key = num-1
    for i in country_code_list[key:num]:
        return str(i)


if __name__ == '__main__':
    """ URL Section """
    GoogleURL = "https://trends.google.com/trends/trendingsearches/daily/rss?geo="
    CountryCodeURL = "https://countrycode.org/"
    """-------------"""
    start = time.time()
    print("hello")

    """Country Code Section """
    parse_row(CountryCodeURL)
    """---------------------"""

    """Google Analytics Section """
    iterate_List(country_code_list)
    """-------------------------"""
    end = time.time()
    print(end - start)

Solution

PEP8

This is the official Python style guide. If you are interested in good naming conventions and other good practices, you can start here.

Among other things, your code would benefit from:

  • variable names using lower_snake_case;
  • class names using PascalCase;
  • comments delimited with # rather than raw strings in the code;
  • redundant/useless parts of the code (such as the duplicated import time) removed.

A first rewrite would yield:

import re
import time

import requests
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer


SESSION = requests.Session()


# This is the Google Analytics Selector Section
class GoogleSession:
    def fetch_google_xml(self, URL, country_code):
        response = SESSION.get(f"{URL}{country_code}")
        return BeautifulSoup(
                response.text, 'xml',
                parse_only=SoupStrainer('channel'))


google_session = GoogleSession()


def google_trends_retriever(URL, country_code):
    xml_soup = google_session.fetch_google_xml(URL, country_code)
    print(country_code)
    titles = xml_soup.find_all('title')[1:]
    traffics = xml_soup.find_all('ht:approx_traffic')
    return [
            (title.text, re.sub("[+,]", "", traffic.text))
            for title, traffic in zip(titles, traffics)
    ]


def create_pd_trend(data):
    check_panda = pd.DataFrame(
            google_trends_retriever(google_URL, data),
            columns=['Title', 'Score'],
    )
    if len(check_panda) == 0:
        print('No available data')
    else:
        return check_panda


# This is the Country Code Selector Section
country_code_list = []


class CountryCodeSession:
    def fetch_country_code_html(self, URL):
        response = SESSION.get(URL)
        return BeautifulSoup(
                response.text, 'html.parser',
                parse_only=SoupStrainer('table'))


country_code_session = CountryCodeSession()


def parse_row(url):
    rows = country_code_session.fetch_country_code_html(url)
    for row in rows.find_all(['td', 'tr']):
        cells = row.find_all('td')[2:3]
        for cell in cells:
            value = cell.string
            country_code_list.append(value[:2])


def iterate_list(data):
    i = 1
    while i <= 239:
        selected_country_code = get_data_from_list(i)
        print(create_pd_trend(selected_country_code))
        i += 1
    else:
        print('Has reach the end of i', i)


def get_data_from_list(num):
    key = num - 1
    for i in country_code_list[key:num]:
        return str(i)


if __name__ == '__main__':
    # URL Section
    google_URL = "https://trends.google.com/trends/trendingsearches/daily/rss?geo="
    country_code_URL = "https://countrycode.org/"
    # -------------
    start = time.time()
    print("hello")

    # Country Code Section
    parse_row(country_code_URL)
    # ---------------------

    # Google Analytics Section
    iterate_list(country_code_list)
    # -------------------------
    end = time.time()
    print(end - start)

Loop like a native

When I saw

def get_data_fromList(num):
    key = num-1
    for i in country_code_list[key:num]:
        return str(i)

I wondered why you would write such convoluted code. Extracting a sublist of one element to iterate over it and return the first one… You can simplify that to

def get_data_from_list(num):
    return str(country_code_list[num - 1])

But then I wondered why you’d need a function for that at all, once I saw how you iterated over indices to call it. Don’t. Use a for-loop as it is meant to be used: by iterating over the content directly.

This would yield:

import re
import time

import requests
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer


SESSION = requests.Session()


# This is the Google Analytics Selector Section
class GoogleSession:
    def fetch_google_xml(self, URL, country_code):
        response = SESSION.get(f"{URL}{country_code}")
        return BeautifulSoup(
                response.text, 'xml',
                parse_only=SoupStrainer('channel'))


google_session = GoogleSession()


def google_trends_retriever(URL, country_code):
    xml_soup = google_session.fetch_google_xml(URL, country_code)
    print(country_code)
    titles = xml_soup.find_all('title')[1:]
    traffics = xml_soup.find_all('ht:approx_traffic')
    return [
            (title.text, re.sub("[+,]", "", traffic.text))
            for title, traffic in zip(titles, traffics)
    ]


def create_pd_trend(data):
    check_panda = pd.DataFrame(
            google_trends_retriever(google_URL, data),
            columns=['Title', 'Score'],
    )
    if len(check_panda) == 0:
        print('No available data')
    else:
        return check_panda


# This is the Country Code Selector Section
class CountryCodeSession:
    def fetch_country_code_html(self, URL):
        response = SESSION.get(URL)
        return BeautifulSoup(
                response.text, 'html.parser',
                parse_only=SoupStrainer('table'))


country_code_session = CountryCodeSession()


def parse_row(url):
    rows = country_code_session.fetch_country_code_html(url)
    return [
            cell.string[:2]
            for row in rows.find_all(['td', 'tr'])
            for cell in row.find_all('td')[2:3]
    ]


def iterate_list(country_codes):
    for country_code in country_codes:
        print(create_pd_trend(str(country_code)))
    else:
        print('Has reach the end of i', len(country_codes))


if __name__ == '__main__':
    # URL Section
    google_URL = "https://trends.google.com/trends/trendingsearches/daily/rss?geo="
    country_code_URL = "https://countrycode.org/"
    # -------------
    start = time.time()
    print("hello")

    # Country Code Section
    country_code_list = parse_row(country_code_URL)
    # ---------------------

    # Google Analytics Section
    iterate_list(country_code_list)
    # -------------------------
    end = time.time()
    print(end - start)

Stop writing classes

Your classes add absolutely no value over a single function. You don’t store state that you reuse after each call. You don’t share state between several functions. They are plain functions in a namespace; just let them be plain functions.
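
For instance, GoogleSession from the rewrite above collapses into a plain function with no loss; only self and the instantiation go away (same imports as before):

def fetch_google_xml(url, country_code):
    response = SESSION.get(f"{url}{country_code}")
    return BeautifulSoup(
            response.text, 'xml',
            parse_only=SoupStrainer('channel'))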

This code can benefit from using a class, but not like that.
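
A class starts paying for itself once each instance owns state that its methods reuse, such as the session and the base URL. A minimal sketch of that idea (the TrendsFetcher name is mine, not from the code above):

class TrendsFetcher:
    def __init__(self, base_url):
        # per-instance state, shared by every call
        self.base_url = base_url
        self.session = requests.Session()

    def fetch(self, country_code):
        response = self.session.get(f"{self.base_url}{country_code}")
        return BeautifulSoup(
                response.text, 'xml',
                parse_only=SoupStrainer('channel'))

The final version at the end of this answer builds on the same idea.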

Parse bytes, not text

lxml, which is the underlying parser used when instructing BeautifulSoup to parse 'xml', explicitly works with raw bytes rather than decoded text. This lets it detect explicit encoding declarations and decode the rest of the document appropriately, so you will never have decoding errors.

This means that you need to feed the response.content rather than response.text to BeautifulSoup when parsing XML.
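
In other words, the only change needed in fetch_google_xml is which attribute of the response you hand over:

def fetch_google_xml(self, URL, country_code):
    response = SESSION.get(f"{URL}{country_code}")
    # response.content is the raw bytes, so lxml can honour the document's
    # own <?xml ... encoding="..."?> declaration instead of requests' guess
    return BeautifulSoup(
            response.content, 'xml',
            parse_only=SoupStrainer('channel'))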

Manage your state properly

Your code relies heavily on global variables and on printing data to work. This is the worst part of your code, as it makes it barely reusable and harder to test properly (think unittest or doctest).

Instead of using global variables, pass them around as parameters and return them from your functions.

Instead of printing results, return values from your functions. This makes it easier to extract and massage the data to your liking.
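
As a sketch of the difference, using create_pd_trend from the rewrite above:

# Before: reads the global google_URL, prints, and sometimes returns None
def create_pd_trend(data):
    check_panda = pd.DataFrame(
            google_trends_retriever(google_URL, data),
            columns=['Title', 'Score'],
    )
    if len(check_panda) == 0:
        print('No available data')
    else:
        return check_panda


# After: everything it needs comes in as parameters, and it always returns
def create_pd_trend(url, country_code):
    return pd.DataFrame(
            google_trends_retriever(url, country_code),
            columns=['Title', 'Score'],
    )

An empty DataFrame is a perfectly good return value: the caller can check df.empty and decide for itself whether 'No available data' is worth printing.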

There is also the global SESSION that is used throughout the code. I’d encapsulate that into a class to have a single session per instance so you can easily crawl several addresses if need be.

My take on the problem would be:

import re
from functools import partial

import requests
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer


class GoogleAnalysis:
    def __init__(self, url):
        session = requests.Session()
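        # bind the base URL once: later calls only supply query parameters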
        self.get_url = partial(session.get, url)

    def _fetch_xml(self, country_code):
        response = self.get_url(params={'geo': country_code})
        return BeautifulSoup(
                response.content, 'xml',
                parse_only=SoupStrainer('channel'))

    def _retrieve_trends(self, country_code):
        soup = self._fetch_xml(country_code)
        titles = soup.find_all('title')[1:]
        traffics = soup.find_all('ht:approx_traffic')
        return [
                (title.text, re.sub("[+,]", "", traffic.text))
                for title, traffic in zip(titles, traffics)
        ]

    def trends(self, country_code):
        df = pd.DataFrame(
                self._retrieve_trends(country_code),
                columns=['Title', 'Score'],
        )
        df['Country Code'] = country_code
        return df


def country_codes(url='https://countrycode.org/'):
    response = requests.get(url)
    soup = BeautifulSoup(
            response.text, 'lxml',
            parse_only=SoupStrainer('table'))
    return [
            cell.string[:2]
            for row in soup.find_all(['td', 'tr'])
            # Some rows don't define row.find_all('td')[2] so filter out
            for cell in row.find_all('td')[2:3]
    ]


def main(url):
    google = GoogleAnalysis(url)
    codes = country_codes()
    return pd.concat([
        google.trends(country_code)
        # Country codes are repeated twice, we only need them once
        for country_code in codes[:len(codes) // 2]
    ])


if __name__ == '__main__':
    import time
    start = time.perf_counter()
    print('Hello!')
    trends = main('https://trends.google.com/trends/trendingsearches/daily/rss')
    print(trends.to_string(index=False))
    print(time.perf_counter() - start)

Note the print(trends.to_string(index=False)) at the end, this could be whatever you like, either printing to CSV or using trends.groupby to redo your old formatting. The idea here is that the computation is done without printing anything. You get to format the data at the end however you like.
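
For instance (trends.csv is just an illustrative filename; to_csv and groupby are standard pandas):

trends.to_csv('trends.csv', index=False)  # dump everything to CSV…

# …or redo the original per-country printing
for country_code, group in trends.groupby('Country Code'):
    print(country_code)
    print(group[['Title', 'Score']].to_string(index=False))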
