Scraping the English Vivino.com reviews from the website
I have two questions about scraping information from Vivino.com: 1.) With the code below I can scrape wine information and reviews from the Vivino website; however, I would like to get the reviews in English or, failing that, only the English reviews. Is there a way to do this? 2.) At the moment I am only scraping wines from Portugal, but I want wines from different countries. If I remove `"country_codes[]": "pt"`, the code no longer works. How can I solve this?
Can someone help me? Thank you very much!
#!/usr/bin/env python
# coding: utf-8
# Import packages
import requests
import json
import pandas as pd
# Get request from the Vivino website
def get_wine_data(wine_id, year, page):
    """Fetch one page of reviews for a wine vintage from the Vivino API.

    Args:
        wine_id: Wine identifier, interpolated into the URL path.
        year: Vintage year, sent as the ``year`` query parameter.
        page: Page number to request; every page asks for 50 reviews.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If connecting or waiting for data exceeds the
            30-second timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    }
    api_url = f"https://www.vivino.com/api/wines/{wine_id}/reviews?per_page=50&year={year}&page={page}"
    print(api_url)
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(api_url, headers=headers, timeout=30)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()
    return response.json()
# Query the Vivino "explore" endpoint; the response is kept in `r`.
explore_params = {
    "country_code": "FR",
    "country_codes[]": "pt",
    "currency_code": "EUR",
    "grape_filter": "varietal",
    "min_rating": "1",
    "order_by": "price",
    "order": "asc",
    "page": 1,
    "price_range_max": "500",
    "price_range_min": "0",
    "wine_type_ids[]": "1",
}
# A browser-like User-Agent is sent with the request.
explore_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
}
r = requests.get(
    "https://www.vivino.com/api/explore/explore",
    params=explore_params,
    headers=explore_headers,
)
# Variables to scrape from the Vivino website
def _wine_row(match):
    """Flatten one entry of the explore response into a 40-field tuple.

    Fields are read in the same order as the dataframe columns they feed,
    so a missing key or a too-short list raises at the same field as a
    plain tuple literal would.
    """
    vintage = match["vintage"]
    wine = vintage["wine"]

    row = [
        wine["winery"]["name"],
        vintage["year"],
        wine["id"],
        f'{wine["name"]} {vintage["year"]}',
        vintage["statistics"]["ratings_average"],
        vintage["statistics"]["ratings_count"],
        match["prices"][0]["amount"],
    ]

    style = wine["style"]
    row += [
        style["acidity"],
        style["blurb"],
        style["body"],
        style["body_description"],
        wine['region']['country']['name'],
        style['description'],
    ]

    # The first four food pairings.
    row += [style['food'][i]['name'] for i in range(4)]

    # NOTE(review): the first grape is read from style.country while the
    # second and third come from region.country -- confirm this is intended.
    row.append(style['country']['most_used_grapes'][0]['name'])
    row += [
        wine['region']['country']['most_used_grapes'][i]['name'] for i in (1, 2)
    ]

    structure_keys = (
        'acidity',
        'calculated_structure_count',
        'intensity',
        'sweetness',
        'tannin',
        'user_structure_count',
    )
    row += [wine['taste']['structure'][key] for key in structure_keys]

    # Group name and count for the first seven flavor entries.
    for i in range(7):
        row.append(wine['taste']['flavor'][i]['group'])
        row.append(wine['taste']['flavor'][i]['stats']['count'])

    return tuple(row)


results = [_wine_row(t) for t in r.json()["explore_vintage"]["matches"]]
# Dump the first two raw matches to JSON files so their structure can be
# inspected by hand.
for t in r.json()["explore_vintage"]["matches"][0:2]:
    wine_id = t["vintage"]["wine"]["id"]
    # No trailing comma here: it would turn `year` into a 1-tuple and the
    # file name would come out as "output-wine<id>-year(<year>,).json".
    year = t["vintage"]["year"]
    with open(f'output-wine{wine_id}-year{year}.json', 'w') as f:
        json.dump(t, f, indent=4, sort_keys=True)
# Saving the results in a dataframe
# Column names are assembled group by group, in the same order as the
# fields of each tuple in `results`.
result_columns = [
    "Winery", "Year", "Wine ID", "Wine", "Rating", "num_review", "price",
    "acidity", 'Blurb', 'Body', 'Body_des', 'country', 'wine_des',
]
result_columns += [f'food_{n}' for n in range(1, 5)]
result_columns += [f'grape_{n}' for n in range(1, 4)]
result_columns += [
    'acidity_score', 'calculated_structure_count', 'intensity',
    'sweetness', 'tannin', 'user_structure_count',
]
result_columns += [
    name
    for n in range(1, 8)
    for name in (f'flavor_{n}', f'flavor_{n}_count')
]
dataframe = pd.DataFrame(results, columns=result_columns)
# Scraping the reviews from the Vivino website
# Rows are collected under their own name so that `ratings` is only ever a
# DataFrame, and the inner loop variable is `review` so that it does not
# overwrite the module-level response object `r`.
review_rows = []
# NOTE: during testing, head(2) was added here so that only two wines were
# processed; the loop below iterates the full dataframe.
for _, row in dataframe.iterrows():
    page = 1
    while True:
        print(
            f'Getting info about wine {row["Wine ID"]}-{row["Year"]} Page {page}'
        )
        d = get_wine_data(row["Wine ID"], row["Year"], page)
        # An empty "reviews" list marks the end of the pages for this wine.
        if not d["reviews"]:
            break
        review_rows.extend(
            [
                row["Year"],
                row["Wine ID"],
                review["rating"],
                review["note"],
                review["created_at"],
            ]
            for review in d["reviews"]
        )
        page += 1
ratings = pd.DataFrame(
    review_rows, columns=["Year", "Wine ID", "User Rating", "Note", "CreatedAt"]
)
# Merging the two datasets; results and ratings.
# No join keys are given, so pandas joins on the columns both frames share.
df_out = pd.merge(left=ratings, right=dataframe)
df_out.to_csv(path_or_buf="data.csv", index=False)
Solution 1:[1]
To get only English reviews, you can filter by the `"language"` key; English reviews have it set to `"en"`.
To get wines from more countries, you can put a list of country codes under the `"country_codes[]"` key, for example `["pt", "es", "fr"]`.
# Import packages
import requests
import json
import pandas as pd
# Get request from the Vivino website
def get_wine_data(wine_id, year, page):
    """Fetch one page of reviews for a wine vintage from the Vivino API.

    Args:
        wine_id: Wine identifier, interpolated into the URL path.
        year: Vintage year, sent as the ``year`` query parameter.
        page: Page number to request; every page asks for 50 reviews.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If connecting or waiting for data exceeds the
            30-second timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    }
    api_url = f"https://www.vivino.com/api/wines/{wine_id}/reviews?per_page=50&year={year}&page={page}"
    print(api_url)
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(api_url, headers=headers, timeout=30)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()
    return response.json()
# Query the Vivino "explore" endpoint; the response is kept in `r`.
explore_params = {
    "country_codes[]": ["pt", "es", "fr"],  # <-- put more country codes here
    "currency_code": "EUR",
    "grape_filter": "varietal",
    "min_rating": "1",
    "order_by": "price",
    "order": "asc",
    "page": 1,
    "price_range_max": "500",
    "price_range_min": "0",
    "wine_type_ids[]": "1",
}
# A browser-like User-Agent is sent with the request.
explore_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
}
r = requests.get(
    "https://www.vivino.com/api/explore/explore",
    params=explore_params,
    headers=explore_headers,
)
# Variables to scrape from the Vivino website
def _summary_row(match):
    """Return (winery name, year, wine id, "<wine name> <year>") for one match."""
    vintage = match["vintage"]
    wine = vintage["wine"]
    return (
        wine["winery"]["name"],
        vintage["year"],
        wine["id"],
        f'{wine["name"]} {vintage["year"]}',
    )


results = list(map(_summary_row, r.json()["explore_vintage"]["matches"]))

# Saving the results in a dataframe
summary_columns = ["Winery", "Year", "Wine ID", "Wine"]
dataframe = pd.DataFrame(results, columns=summary_columns)
# Scraping the reviews from the Vivino website
# Rows are collected under their own name so that `ratings` is only ever a
# DataFrame, and the inner loop variable is `review` so that it does not
# overwrite the module-level response object `r`.
review_rows = []
for _, row in dataframe.iterrows():
    page = 1
    while True:
        print(
            f'Getting info about wine {row["Wine ID"]}-{row["Year"]} Page {page}'
        )
        d = get_wine_data(row["Wine ID"], row["Year"], page)
        # An empty "reviews" list marks the end of the pages for this wine.
        if not d["reviews"]:
            break
        review_rows.extend(
            [
                row["Year"],
                row["Wine ID"],
                review["rating"],
                review["note"],
                review["created_at"],
            ]
            for review in d["reviews"]
            # <-- get only english reviews; .get() skips a review that has
            # no "language" key instead of raising KeyError
            if review.get("language") == "en"
        )
        page += 1
ratings = pd.DataFrame(
    review_rows, columns=["Year", "Wine ID", "User Rating", "Note", "CreatedAt"]
)
# Merging the two datasets; results and ratings.
# No join keys are given, so pandas joins on the columns both frames share.
df_out = pd.merge(left=ratings, right=dataframe)
df_out.to_csv(path_or_buf="data.csv", index=False)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 | DisappointedByUnaccountableMod |