Problem
At the moment I’d like recommendation where I can change and what I can use to improve on simplicity and modularity. If I’m practicing good naming conventions as to provide clean readable code. Any critique is much appreciated.
import time
import re
import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import time
SESSION = requests.Session()
""" This is the Google Analytics Selector Section """
class myGoogleSession:
def fetch_google_xml(self, URL, country_code):
format_url = f"{URL}{country_code}"
response = SESSION.get(format_url)
soup = BeautifulSoup(response.text, 'xml',
parse_only=SoupStrainer('channel'))
return soup
google_session = myGoogleSession()
def google_trends_retriever(URL, country_code):
xml_soup = google_session.fetch_google_xml(URL, country_code)
print(country_code)
return[(title.text, re.sub("[+,]", "", traffic.text))
for title, traffic in zip(xml_soup.find_all('title')[1:],
xml_soup.find_all('ht:approx_traffic'))]
def create_pdTrend(data):
check_panda = pd.DataFrame(
google_trends_retriever(GoogleURL, data),
columns=['Title', 'Score']
)
if len(check_panda) == 0:
print('No available data')
else:
return check_panda
""" This is the Country Code Selector Section """
country_code_list = []
class myCountryCodeSession:
def fetch_countrycode_html(self, URL):
response = SESSION.get(URL)
soup = BeautifulSoup(response.text, 'html.parser',
parse_only=SoupStrainer('table'))
return soup
countryCode_session = myCountryCodeSession()
def parse_row(url):
rows = countryCode_session.fetch_countrycode_html(url)
_rows = rows.findChildren(['td', 'tr'])
for row in _rows:
cells = row.findChildren('td')[2:3]
for cell in cells:
value = cell.string
country_code_list.append(value[:2])
return None
def create_pdCountryCode(country_code):
return pd.DataFrame({'Country_Code': country_code})
def iterate_List(data):
i = 1
while i <= 239:
selected_CountryCode = get_data_fromList(i)
print(create_pdTrend(selected_CountryCode))
i += 1
else:
print('Has reach the end of i ' + str(i))
def get_data_fromList(num):
key = num-1
for i in country_code_list[key:num]:
return str(i)
if __name__ == '__main__':
""" URL Section """
GoogleURL = "https://trends.google.com/trends/trendingsearches/daily/rss?geo="
CountryCodeURL = "https://countrycode.org/"
"""-------------"""
start = time.time()
print("hello")
"""Country Code Section """
parse_row(CountryCodeURL)
"""---------------------"""
"""Google Analytics Section """
iterate_List(country_code_list)
"""-------------------------"""
end = time.time()
print(end - start)
Solution
PEP8
This is the official Python style guide. If you are interested in good naming conventions and other good practices, you can start here.
Among other things, you code would benefit from:
- variable name using
lower_snake_case
; - class name using
PascalCase
; - comments delimited using
#
and not raw strings in the code; - redundant/useless parts of the code removed.
A first rewrite would yield:
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
SESSION = requests.Session()
# This is the Google Analytics Selector Section
class GoogleSession:
def fetch_google_xml(self, URL, country_code):
response = SESSION.get(f"{URL}{country_code}")
return BeautifulSoup(
response.text, 'xml',
parse_only=SoupStrainer('channel'))
google_session = GoogleSession()
def google_trends_retriever(URL, country_code):
xml_soup = google_session.fetch_google_xml(URL, country_code)
print(country_code)
titles = xml_soup.find_all('title')[1:]
traffics = xml_soup.find_all('ht:approx_traffic')
return [
(title.text, re.sub("[+,]", "", traffic.text))
for title, traffic in zip(titles, traffics)
]
def create_pd_trend(data):
check_panda = pd.DataFrame(
google_trends_retriever(google_URL, data),
columns=['Title', 'Score'],
)
if len(check_panda) == 0:
print('No available data')
else:
return check_panda
# This is the Country Code Selector Section
country_code_list = []
class CountryCodeSession:
def fetch_country_code_html(self, URL):
response = SESSION.get(URL)
return BeautifulSoup(
response.text, 'html.parser',
parse_only=SoupStrainer('table'))
country_code_session = CountryCodeSession()
def parse_row(url):
rows = country_code_session.fetch_country_code_html(url)
for row in rows.find_all(['td', 'tr']):
cells = row.find_all('td')[2:3]
for cell in cells:
value = cell.string
country_code_list.append(value[:2])
def iterate_list(data):
i = 1
while i <= 239:
selected_country_code = get_data_from_list(i)
print(create_pd_trend(selected_country_code))
i += 1
else:
print('Has reach the end of i', i)
def get_data_from_list(num):
key = num - 1
for i in country_code_list[key:num]:
return str(i)
if __name__ == '__main__':
# URL Section
google_URL = "https://trends.google.com/trends/trendingsearches/daily/rss?geo="
country_code_URL = "https://countrycode.org/"
# -------------
start = time.time()
print("hello")
# Country Code Section
parse_row(country_code_URL)
# ---------------------
# Google Analytics Section
iterate_list(country_code_list)
# -------------------------
end = time.time()
print(end - start)
Loop like a native
When I saw
def get_data_fromList(num):
key = num-1
for i in country_code_list[key:num]:
return str(i)
I wondered why you would write such convoluted code. Extracting a sublist of one element to iterate over it and return the first one⦠You can simplify that to
def get_data_from_list(num):
return str(country_code_list[num - 1])
But I wondered why using a method for that, and saw how you iterated over indices to call this function. Don’t. Use a for-loop as it is meant to be used: by iterating over the content directly.
This would yield:
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
SESSION = requests.Session()
# This is the Google Analytics Selector Section
class GoogleSession:
def fetch_google_xml(self, URL, country_code):
response = SESSION.get(f"{URL}{country_code}")
return BeautifulSoup(
response.text, 'xml',
parse_only=SoupStrainer('channel'))
google_session = GoogleSession()
def google_trends_retriever(URL, country_code):
xml_soup = google_session.fetch_google_xml(URL, country_code)
print(country_code)
titles = xml_soup.find_all('title')[1:]
traffics = xml_soup.find_all('ht:approx_traffic')
return [
(title.text, re.sub("[+,]", "", traffic.text))
for title, traffic in zip(titles, traffics)
]
def create_pd_trend(data):
check_panda = pd.DataFrame(
google_trends_retriever(google_URL, data),
columns=['Title', 'Score'],
)
if len(check_panda) == 0:
print('No available data')
else:
return check_panda
# This is the Country Code Selector Section
class CountryCodeSession:
def fetch_country_code_html(self, URL):
response = SESSION.get(URL)
return BeautifulSoup(
response.text, 'html.parser',
parse_only=SoupStrainer('table'))
country_code_session = CountryCodeSession()
def parse_row(url):
rows = country_code_session.fetch_country_code_html(url)
return [
cell.string[:2]
for row in rows.find_all(['td', 'tr'])
for cell in row.find_all('td')[2:3]
]
def iterate_list(country_codes):
for country_code in country_codes:
print(create_pd_trend(str(country_code)))
else:
print('Has reach the end of i', len(country_codes))
if __name__ == '__main__':
# URL Section
google_URL = "https://trends.google.com/trends/trendingsearches/daily/rss?geo="
country_code_URL = "https://countrycode.org/"
# -------------
start = time.time()
print("hello")
# Country Code Section
country_code_list = parse_row(country_code_URL)
# ---------------------
# Google Analytics Section
iterate_list(country_code_list)
# -------------------------
end = time.time()
print(end - start)
Stop writting classes
Your classes add absolutely no value over a single function. You don’t store state that you reuse after each call. You don’t share state between several functions. They are plain functions in a namespace, just let them be plain functions.
This code can benefit from using a class, but not like that.
Parse bytes, not text
lxml
, which is the underlying parser used when instructing BeautifulSoup
to decode 'xml'
explicitly works with raw bytes rather than decoded text. This is to be able to detect explicit encoding declarations and decode the rest of the document appropriately; so you will never have decoding errors.
This means that you need to feed the response.content
rather than response.text
to BeautifulSoup
when parsing XML.
Manage your state properly
Your code heavily relly on global variables and print
ing data to work. This is the worst part of your code as it make it barely reusable and harder to test properly (think unittest
or doctest
).
Instead of using global variables, pass them around as parameters and return them from your functions.
Instead of printing results, return values from your functions. This make it easier to extract and massage data into your liking.
There is also the global SESSION
that is used throughout the code. I’d encapsulate that into a class to have a single session per instance so you can easily crawl several addresses if need be.
My take on the problem would be:
import re
from functools import partial
import requests
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
class GoogleAnalysis:
def __init__(self, url):
session = requests.Session()
self.get_url = partial(session.get, url)
def _fetch_xml(self, country_code):
response = self.get_url(params={'geo': country_code})
return BeautifulSoup(
response.content, 'xml',
parse_only=SoupStrainer('channel'))
def _retrieve_trends(self, country_code):
soup = self._fetch_xml(country_code)
titles = soup.find_all('title')[1:]
traffics = soup.find_all('ht:approx_traffic')
return [
(title.text, re.sub("[+,]", "", traffic.text))
for title, traffic in zip(titles, traffics)
]
def trends(self, country_code):
df = pd.DataFrame(
self._retrieve_trends(country_code),
columns=['Title', 'Score'],
)
df['Country Code'] = country_code
return df
def country_codes(url='https://countrycode.org/'):
response = requests.get(url)
soup = BeautifulSoup(
response.text, 'lxml',
parse_only=SoupStrainer('table'))
return [
cell.string[:2]
for row in soup.find_all(['td', 'tr'])
# Some rows don't define row.find_all('td')[2] so filter out
for cell in row.find_all('td')[2:3]
]
def main(url):
google = GoogleAnalysis(url)
codes = country_codes()
return pd.concat([
google.trends(country_code)
# Country codes are repeated twice, we only need them once
for country_code in codes[:len(codes) // 2]
])
if __name__ == '__main__':
import time
start = time.perf_counter()
print('Hello!')
trends = main('https://trends.google.com/trends/trendingsearches/daily/rss')
print(trends.to_string(index=False))
print(time.perf_counter() - start)
Note the print(trends.to_string(index=False))
at the end, this could be whatever you like, either printing to CSV or using trends.groupby
to redo your old formatting. The idea here is that the computation is done without print
ing anything. You get to format the data at the end however you like.