Scraping the English Vivino.com reviews from the website

I have two questions about scraping information from Vivino.com: 1.) With the code below I can scrape wine information and reviews from the Vivino website; however, I would like to get the reviews in English or, failing that, only the reviews written in English. Is there a way to do this? 2.) At the moment I am only scraping wines from Portugal, but I want wines from several countries. If I remove `"country_codes[]": "pt"`, the code no longer works. How can I solve this?

Can someone help me? Thank you very much!

#!/usr/bin/env python
# coding: utf-8

# Import packages
import requests
import json
import pandas as pd

# Get request from the Vivino website
def get_wine_data(wine_id, year, page):
    """Fetch one page of reviews for a wine vintage from the Vivino API.

    Args:
        wine_id: Vivino wine identifier, interpolated into the URL path.
        year: Vintage year, sent as the ``year`` query parameter.
        page: Page number to request; every page asks for 50 reviews.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within 30 seconds.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    }

    api_url = f"https://www.vivino.com/api/wines/{wine_id}/reviews?per_page=50&year={year}&page={page}"
    print(api_url)

    # requests never times out on its own; without this a stalled connection
    # would block the whole scrape indefinitely.
    response = requests.get(api_url, headers=headers, timeout=30)
    # Surface HTTP errors here rather than as a JSON/KeyError in the caller.
    response.raise_for_status()

    return response.json()

# Query Vivino's explore endpoint for the first page of matching wines,
# ordered by ascending price. The decoded JSON is read from `r` further down.
r = requests.get(
    "https://www.vivino.com/api/explore/explore",
    params={
        "country_code": "FR",
        "country_codes[]": "pt",
        "currency_code": "EUR",
        "grape_filter": "varietal",
        "min_rating": "1",
        "order_by": "price",
        "order": "asc",
        "page": 1,
        "price_range_max": "500",
        "price_range_min": "0",
        "wine_type_ids[]": "1",
    },
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
    },
    # requests has no default timeout; without one a stalled connection
    # would hang the script forever.
    timeout=30,
)
# Stop on a 4xx/5xx answer instead of failing later while reading the JSON.
r.raise_for_status()

# Variables to scrape from the Vivino website
def _dig(node, *path):
    """Walk *path* (dict keys and list indexes) into *node*.

    Returns None as soon as a step cannot be followed (absent key, index out
    of range, or a None / non-container value on the way) instead of raising.
    """
    for step in path:
        try:
            node = node[step]
        except (KeyError, IndexError, TypeError):
            return None
    return node


def _wine_row(t):
    """Flatten one explore match *t* into the 40-field tuple kept in `results`.

    The identifying fields (winery, year, wine id, name, rating statistics)
    are indexed directly, so a match without them still fails loudly. Price,
    style, region and taste details are optional: a missing value becomes
    None rather than aborting the whole scrape.
    """
    vintage = t["vintage"]
    wine = vintage["wine"]
    style = _dig(wine, "style")
    country = _dig(wine, "region", "country")
    structure = _dig(wine, "taste", "structure")
    flavors = _dig(wine, "taste", "flavor")

    row = [
        wine["winery"]["name"],
        vintage["year"],
        wine["id"],
        f'{wine["name"]} {vintage["year"]}',
        vintage["statistics"]["ratings_average"],
        vintage["statistics"]["ratings_count"],
        _dig(t, "prices", 0, "amount"),
        _dig(style, "acidity"),
        _dig(style, "blurb"),
        _dig(style, "body"),
        _dig(style, "body_description"),
        _dig(country, "name"),
        _dig(style, "description"),
    ]

    # First four food pairings of the style.
    row.extend(_dig(style, "food", i, "name") for i in range(4))

    # NOTE(review): the first grape is read from style -> country while the
    # next two come from region -> country, exactly as in the original
    # expression list. Confirm which source is intended.
    row.append(_dig(style, "country", "most_used_grapes", 0, "name"))
    row.extend(_dig(country, "most_used_grapes", i, "name") for i in (1, 2))

    row.extend(
        _dig(structure, key)
        for key in (
            "acidity",
            "calculated_structure_count",
            "intensity",
            "sweetness",
            "tannin",
            "user_structure_count",
        )
    )

    # First seven flavor groups, each followed by its count.
    for i in range(7):
        row.append(_dig(flavors, i, "group"))
        row.append(_dig(flavors, i, "stats", "count"))

    return tuple(row)


results = [_wine_row(t) for t in r.json()["explore_vintage"]["matches"]]

###
# Debug aid: write the raw JSON of the first two matches, one file per wine.
for t in r.json()["explore_vintage"]["matches"][0:2]:
    wine_id = t["vintage"]["wine"]["id"]
    # No trailing comma here: with one, `year` became a 1-tuple and the file
    # name read "...-year(2018,).json" instead of "...-year2018.json".
    year = t["vintage"]["year"]

    with open(f'output-wine{wine_id}-year{year}.json', 'w+') as f:
        json.dump(t, f, indent=4, sort_keys=True)
###

# Build the wine table: one row per entry of `results`, one named column per
# scraped field, in the same order as the tuples were assembled.
wine_columns = [
    # identity, rating and price
    "Winery", "Year", "Wine ID", "Wine", "Rating", "num_review", "price",
    # style description
    "acidity", "Blurb", "Body", "Body_des", "country", "wine_des",
    # food pairings and grapes
    "food_1", "food_2", "food_3", "food_4",
    "grape_1", "grape_2", "grape_3",
    # taste structure
    "acidity_score", "calculated_structure_count", "intensity",
    "sweetness", "tannin", "user_structure_count",
    # flavor groups with their counts
    "flavor_1", "flavor_1_count",
    "flavor_2", "flavor_2_count",
    "flavor_3", "flavor_3_count",
    "flavor_4", "flavor_4_count",
    "flavor_5", "flavor_5_count",
    "flavor_6", "flavor_6_count",
    "flavor_7", "flavor_7_count",
]
dataframe = pd.DataFrame(data=results, columns=wine_columns)

# Scraping the reviews from the Vivino website
ratings = []
# To try the script on two wines only, iterate over
# dataframe.head(2).iterrows() instead.
for _, row in dataframe.iterrows():
    page = 1
    while True:
        print(
            f'Getting info about wine {row["Wine ID"]}-{row["Year"]} Page {page}'
        )

        payload = get_wine_data(row["Wine ID"], row["Year"], page)
        reviews = payload["reviews"]

        # An empty page means every review of this wine has been read.
        if not reviews:
            break

        # The loop variable is deliberately not called `r`: that name holds
        # the explore response at module level and must not be overwritten.
        for review in reviews:
            ratings.append(
                [
                    row["Year"],
                    row["Wine ID"],
                    review["rating"],
                    review["note"],
                    review["created_at"],
                ]
            )

        page += 1

# Turn the collected rows into a table; `ratings` is rebound from list to
# DataFrame here.
ratings = pd.DataFrame(
    ratings, columns=["Year", "Wine ID", "User Rating", "Note", "CreatedAt"]
)

# Attach each wine's metadata to its reviews and write the result to data.csv.
# No join keys are passed, so pandas joins on the columns both frames share
# ("Year" and "Wine ID").
df_out = pd.merge(ratings, dataframe)
df_out.to_csv("data.csv", index=False)


Solution 1:[1]

  1. To get only English reviews, filter on each review's "language" key: English reviews have it set to "en".

  2. To get wines from more countries, put a list of codes in the "country_codes[]" key, for example ["pt", "es", "fr"].

# Import packages
import requests
import json
import pandas as pd

# Get request from the Vivino website
def get_wine_data(wine_id, year, page):
    """Fetch one page of reviews for a wine vintage from the Vivino API.

    Args:
        wine_id: Vivino wine identifier, interpolated into the URL path.
        year: Vintage year, sent as the ``year`` query parameter.
        page: Page number to request; every page asks for 50 reviews.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within 30 seconds.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    }

    api_url = f"https://www.vivino.com/api/wines/{wine_id}/reviews?per_page=50&year={year}&page={page}"
    print(api_url)

    # requests never times out on its own; without this a stalled connection
    # would block the whole scrape indefinitely.
    response = requests.get(api_url, headers=headers, timeout=30)
    # Surface HTTP errors here rather than as a JSON/KeyError in the caller.
    response.raise_for_status()

    return response.json()


# Query Vivino's explore endpoint for the first page of matching wines,
# ordered by ascending price. The decoded JSON is read from `r` further down.
r = requests.get(
    "https://www.vivino.com/api/explore/explore",
    params={
        # A list value is sent as one repeated query parameter per item.
        "country_codes[]": ["pt", "es", "fr"],  # <-- put more country codes here
        "currency_code": "EUR",
        "grape_filter": "varietal",
        "min_rating": "1",
        "order_by": "price",
        "order": "asc",
        "page": 1,
        "price_range_max": "500",
        "price_range_min": "0",
        "wine_type_ids[]": "1",
    },
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
    },
    # requests has no default timeout; without one a stalled connection
    # would hang the script forever.
    timeout=30,
)
# Stop on a 4xx/5xx answer instead of failing later while reading the JSON.
r.raise_for_status()

# Collect (winery, year, wine id, display name) for every explore match
results = []
for match in r.json()["explore_vintage"]["matches"]:
    vintage = match["vintage"]
    wine = vintage["wine"]
    results.append(
        (
            wine["winery"]["name"],
            vintage["year"],
            wine["id"],
            f'{wine["name"]} {vintage["year"]}',
        )
    )

# Wine table: one row per match, columns named in the order collected above
wine_columns = ["Winery", "Year", "Wine ID", "Wine"]
dataframe = pd.DataFrame(data=results, columns=wine_columns)

# Scraping the reviews from the Vivino website
ratings = []

for _, row in dataframe.iterrows():
    page = 1
    while True:
        print(
            f'Getting info about wine {row["Wine ID"]}-{row["Year"]} Page {page}'
        )

        payload = get_wine_data(row["Wine ID"], row["Year"], page)
        reviews = payload["reviews"]

        # An empty page means every review of this wine has been read.
        if not reviews:
            break

        # The loop variable is deliberately not called `r`: that name holds
        # the explore response at module level and must not be overwritten.
        for review in reviews:
            if review["language"] != "en":  # <-- get only english reviews
                continue

            ratings.append(
                [
                    row["Year"],
                    row["Wine ID"],
                    review["rating"],
                    review["note"],
                    review["created_at"],
                ]
            )

        page += 1


# Turn the collected rows into a table; `ratings` is rebound from list to
# DataFrame here.
ratings = pd.DataFrame(
    ratings, columns=["Year", "Wine ID", "User Rating", "Note", "CreatedAt"]
)

# Attach each wine's metadata to its reviews and write the result to data.csv.
# No join keys are passed, so pandas joins on the columns both frames share
# ("Year" and "Wine ID").
df_out = pd.merge(ratings, dataframe)
df_out.to_csv("data.csv", index=False)

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 DisappointedByUnaccountableMod