Problem
I made a little program to parse data from an API.
I do not have experience with Pandas. It works, but I would like to know how to do this better and more efficiently.
class Albion_data():
    """Price lookup for a single item on the albion-online-data.com API.

    Attributes:
        location: comma-separated city names sent as the ``locations``
            query parameter.
        item: item id interpolated into the request URL.
    """

    def __init__(self, item):
        # Cities to compare. Note that 'Bridgewatch' appears twice in the list.
        self.location = f'Bridgewatch,Thetford,FortSterling,Martlock,Bridgewatch,Lymhurst'
        self.item = item

    @property
    def get_data(self):
        """Fetch the prices for ``self.item`` and summarise the price spread.

        Performs an HTTP request every time the property is read.

        Returns:
            A one-row DataFrame with the columns ``ITEM``, ``CITY max ``,
            ``CITY min``, ``MIN_PRICE``, ``MAX PRICE`` and ``GAIN``, or
            ``None`` when the API answers with an empty list (there is no
            ``else`` branch).
        """
        url = f"https://www.albion-online-data.com/api/v1/stats/Prices/{self.item}?locations={self.location}"
        response = (requests.get(url).text)
        response_json = json.loads(response)
        if response_json != []:
            # Keep only the three columns used below; their order fixes the
            # positional indices 0=item_id, 1=city, 2=sell_price_min.
            response_json = pd.DataFrame(response_json)[["item_id", "city", "sell_price_min"]]
            # Rows holding the highest and the lowest minimum sell price.
            max = response_json.loc[response_json["sell_price_min"].idxmax()]
            min = response_json.loc[response_json["sell_price_min"].idxmin()]
            gain = max[2] - min[2]
            data = pd.DataFrame(
                [[max[0], max[1], min[1], min[2], max[2], gain]],
                columns=["ITEM", "CITY max ", "CITY min", "MIN_PRICE", "MAX PRICE", "GAIN"])
            return data
def get_data_frame():
    """Collect the price spread of every item in ``items.txt`` into one table.

    Reads the ``item`` column of the ``:``-separated file ``items.txt``,
    queries each item through ``Albion_data(item).get_data``, appends the
    results to one DataFrame and writes that DataFrame to ``data.csv``.

    Returns:
        The accumulated DataFrame.
    """
    # The closing parenthesis of read_csv(...) was missing, which made the
    # ['item'] subscript apply to the `names` list literal instead of the
    # parsed frame.
    item_list = pd.read_csv('items.txt', sep=':', names=['nuber', 'item'])['item']
    item_price = pd.DataFrame(columns=["ITEM", "CITY max ", "CITY min", "MIN_PRICE", "MAX PRICE", "GAIN"])
    for item in item_list:
        # NOTE: DataFrame.append was removed in pandas 2.0; on current pandas
        # collect the frames in a list and call pd.concat once instead.
        item_price = item_price.append(Albion_data(item).get_data)
    item_price.to_csv('data.csv')
    return item_price
```
Solution
Do you need a class?
Right now, I’m not sure you need a class. Your class has two methods, one of which is `__init__`, so it could be refactored into a plain function:
def get_data(item):
    """Fetch the prices for ``item`` and summarise the price spread.

    Args:
        item: item id interpolated into the request URL.

    Returns:
        A one-row DataFrame with the columns ``ITEM``, ``CITY max ``,
        ``CITY min``, ``MIN_PRICE``, ``MAX PRICE`` and ``GAIN``, or ``None``
        when the API answers with an empty list (there is no ``else`` branch).
    """
    location = 'Bridgewatch,Thetford,FortSterling,Martlock,Bridgewatch,Lymhurst'
    url = f"https://www.albion-online-data.com/api/v1/stats/Prices/{item}?locations={location}"
    # the parentheses that wrapped this call in the original are not needed
    response = requests.get(url).text
    response_json = json.loads(response)
    if response_json != []:
        prices = pd.DataFrame(response_json)[["item_id", "city", "sell_price_min"]]
        # Named `highest`/`lowest` so the builtins max/min are not shadowed,
        # and read by label because integer keys on a label-indexed Series
        # are deprecated in pandas.
        highest = prices.loc[prices["sell_price_min"].idxmax()]
        lowest = prices.loc[prices["sell_price_min"].idxmin()]
        gain = highest["sell_price_min"] - lowest["sell_price_min"]
        data = pd.DataFrame(
            [[
                highest["item_id"],
                highest["city"],
                lowest["city"],
                lowest["sell_price_min"],
                highest["sell_price_min"],
                gain,
            ]],
            columns=["ITEM", "CITY max ", "CITY min", "MIN_PRICE", "MAX PRICE", "GAIN"])
        return data
This avoids the cost of constructing a class instance just to get a dataframe.
Checking the Value of json.loads
Now, I’m not completely sure what `response.text` will look like every time, but your code assumes that the decoded JSON is always a list. Because there is no `else`, the function returns `None` whenever the `if` condition is not satisfied, which could lead to `TypeError`s in your dataframe construction. It might be easier to write a generator, so that items without data are simply skipped instead of producing a return value:
# set the argument to be an iterable
def get_data(items):
    """Yield one price-spread record per item that has price data.

    Args:
        items: iterable of item ids; one HTTP request is made per item.

    Yields:
        A dict with the keys ``ITEM``, ``CITY max ``, ``CITY min``,
        ``MIN_PRICE``, ``MAX PRICE`` and ``GAIN``. Items whose response is
        empty or is not a list yield nothing.
    """
    location = 'Bridgewatch,Thetford,FortSterling,Martlock,Bridgewatch,Lymhurst'
    for item in items:
        url = f"https://www.albion-online-data.com/api/v1/stats/Prices/{item}?locations={location}"
        response = requests.get(url).text
        response_json = json.loads(response)
        # rely on the falsiness of empty lists and enforce that it will indeed be
        # a list type using isinstance
        if response_json and isinstance(response_json, list):
            # since response_json is a list, don't create a dataframe,
            # you can instead provide a key to the max and min functions
            maxval = max(response_json, key=lambda x: x['sell_price_min'])
            minval = min(response_json, key=lambda x: x['sell_price_min'])
            # Build the record key by key, so that each value lands under
            # the right column name and the API records are not mutated.
            # Now you don't need to rely on an index
            yield {
                "ITEM": maxval['item_id'],
                "CITY max ": maxval['city'],
                "CITY min": minval['city'],
                "MIN_PRICE": minval['sell_price_min'],
                "MAX PRICE": maxval['sell_price_min'],
                "GAIN": maxval['sell_price_min'] - minval['sell_price_min'],
            }
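Now, if the `if` condition isn’t `True`, the generator simply yields nothing for that item, so you never get `None` back. Compare that with a plain function that falls through its `if`, where the implicit `None` breaks the dataframe construction: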
def func(a):
    """Return ``[1, 2]`` when ``a < 2``; otherwise fall through and return None."""
    if a < 2:
        return [1, 2]
    # no else branch: for a >= 2 the function implicitly returns None
# func(2) and func(3) return None, so two of the four rows passed in are None
df = pd.DataFrame([func(x) for x in range(4)], columns = ['a', 'b'])
# raises: TypeError: object of type 'NoneType' has no len()
The benefit is that your `get_data_frame` function now becomes much simpler:
def get_data_frame():
    """Build the price-spread table for every item in ``items.txt``.

    Reads the ``item`` column of the ``:``-separated file ``items.txt``,
    feeds it to the ``get_data`` generator and writes the resulting
    DataFrame to ``data.csv``.

    Returns:
        The DataFrame built from the records yielded by ``get_data``.
    """
    # The closing parenthesis of read_csv(...) was missing here.
    item_list = pd.read_csv('items.txt', sep=':', names=['nuber', 'item'])['item']
    # You don't need to initialize an empty item_price dataframe first:
    # get_data yields dicts, so the column names come from their keys and
    # the whole table can be built in one call.
    item_price = pd.DataFrame(list(get_data(item_list)))
    item_price.to_csv('data.csv')
    return item_price
read_csv
Now, the only other issue I can see is that `pd.read_csv` loads the whole file into memory as a dataframe, even though you really only want the item ids. You can iterate over a file handle directly instead:
def file_iterate(path='items.txt'):
    """Yield the item id found on each line of a ``:``-separated items file.

    Args:
        path: file to read; defaults to ``items.txt`` in the working
            directory, which is what the original hard-coded.

    Yields:
        The second ``:``-separated field of each line, stripped of
        surrounding whitespace. Lines without a second field (blank lines,
        lines with no ``:``) and lines whose item field is empty are skipped
        instead of raising ``IndexError``.
    """
    with open(path) as fh:
        for line in fh:
            # item is the second entry, so split on ':' and grab index 1
            fields = line.split(':')
            if len(fields) < 2:
                continue
            item = fields[1].strip()
            if item:
                yield item
def get_data_frame():
    """Build the price-spread table from the ids streamed by ``file_iterate``.

    Passes the ``file_iterate`` generator to the ``get_data`` generator,
    builds one DataFrame from the yielded records and writes it to
    ``data.csv``.

    Returns:
        The DataFrame built from the records yielded by ``get_data``.
    """
    items = file_iterate()
    # list(...) drains the generator; no element-by-element comprehension
    # is needed just to copy it.
    item_price = pd.DataFrame(list(get_data(items)))
    item_price.to_csv('data.csv')
    return item_price
The generators let you keep just one dataframe in memory and produce values one at a time.
try/except
The last thing I’d do is ensure the program doesn’t crash on a bad web request or a failed `json.loads`:
from json import JSONDecodeError

from requests.exceptions import HTTPError


def get_data(items):
    """Fetch and decode the price data for each item, skipping failed ones.

    Args:
        items: iterable of item ids; one HTTP request is made per item.

    An item whose request returns an HTTP error status, or whose body is not
    valid JSON, is reported with ``print`` and skipped. Any other exception
    propagates to the caller.
    """
    location = 'Bridgewatch,Thetford,FortSterling,Martlock,Bridgewatch,Lymhurst'
    for item in items:
        url = f"https://www.albion-online-data.com/api/v1/stats/Prices/{item}?locations={location}"
        # wrap in try/except here
        try:
            response = requests.get(url)
            # requests only raises HTTPError for 4xx/5xx responses when asked
            # to, so without this call the handler below could never run
            response.raise_for_status()
            response_json = json.loads(response.text)
        except JSONDecodeError:
            print(f'bad json for {item}')
            continue  # skip it so it doesn't ruin your process, or you can raise it
        except HTTPError:
            print(f'HTTPError on {url}')
            continue  # skip or you can raise
        # Unexpected errors are not caught: they propagate on their own, so
        # an `except Exception: raise` clause is not needed.

        # rest of function