Jazmin Barrionuevo

Rio de Janeiro Data Analysis

2022

2-minute read

Python

Matplotlib

csv

Seaborn

The focus of this analysis is on Rio de Janeiro, utilizing neighborhood information, prices, and rental types to answer the following questions:

Average price per rental type in Rio de Janeiro:

The function average_price_by_room_type() (called precio_promedio_por_tipo_de_alquiler() in the original version) calculates the average price for each rental type. The results are stored in a dictionary where the rental type serves as the key and the average price as the value.

For the analysis of the first problem, we aim to determine the average price per rental type, and this information is stored in a dictionary where each rental type is associated with its average price. The process involves summing the prices for each rental type and then calculating the average.

import csv
from statistics import mean
import matplotlib.pyplot as plt
import seaborn as sns

def average_price_by_room_type(filename):
    """Compute the mean listing price for each recognised room type.

    Reads the CSV at *filename*, which must provide ``room_type`` and
    ``price`` columns, and averages the prices per room type.

    Args:
        filename: Path to the listings CSV file (assumed to be UTF-8).

    Returns:
        A dict mapping each of the four recognised room types to its mean
        price. A room type without any listing maps to ``0``.

    Raises:
        KeyError: If a row lacks ``room_type``, or a row of a recognised
            room type lacks ``price``.
        ValueError: If the price of a recognised room type is not numeric.
    """
    prices = {'Entire home/apt': [], 'Private room': [], 'Hotel room': [], 'Shared room': []}

    # Decode explicitly as UTF-8: without it the result depends on the
    # platform's locale encoding and non-ASCII text in the file may fail
    # to decode.
    with open(filename, newline='', encoding='utf-8') as file:
        for row in csv.DictReader(file):
            price_list = prices.get(row['room_type'])
            if price_list is None:
                # Rows of other room types are ignored, so their price is
                # never parsed and cannot abort the whole run.
                continue
            price_list.append(float(row['price']))

    return {room_type: mean(price_list) if price_list else 0
            for room_type, price_list in prices.items()}

def plot_average_prices(averages):
    """Draw a bar chart of the average price per room type.

    Args:
        averages: Mapping of room type to average price.

    The chart is saved as ``average_prices.png`` and then shown on screen.
    """
    sns.set(style="whitegrid")
    plt.figure(figsize=(15, 6))

    # One bar per room type, in the mapping's own order.
    labels, heights = averages.keys(), averages.values()
    plt.bar(labels, heights)

    # Tilt the tick labels so long room-type names do not collide.
    plt.xticks(rotation=45, ha='right')

    axis_font_size = 15
    plt.xlabel('Room Type', fontsize=axis_font_size)
    plt.ylabel('Average Price', fontsize=axis_font_size)
    plt.title('Average Price by Room Type', fontsize=20)

    plt.tight_layout()
    plt.savefig('average_prices.png')
    plt.show()

def main():
    """Print the average price per room type and plot the result.

    Reads ``listings.csv`` from the current working directory.
    """
    averages = average_price_by_room_type("listings.csv")

    # Assemble the report first, then emit it in a single print call.
    report_lines = ["Average prices by room type:"]
    report_lines.extend(
        f"{room_type}: $ {avg_price:.2f}"
        for room_type, avg_price in averages.items()
    )
    print("\n".join(report_lines))

    plot_average_prices(averages)

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Number of rental types per neighborhood:

The function count_types_by_neighborhood() (called cantidad_de_tipos_por_barrio() in the original version) returns a dictionary (originally named tipo_de_alquiler_por_barrio) that maps each neighborhood to the count of each rental type available in that neighborhood.

The second analysis is addressed by the function count_types_by_neighborhood() (cantidad_de_tipos_por_barrio() in the original version), which returns a dictionary (originally named tipo_de_alquiler_por_barrio). This dictionary has neighborhoods as keys and another dictionary as values. The inner dictionary has rental types as keys and the count of each type in that neighborhood as values.

import csv
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(filename):
    """Load the listing fields used by this analysis from a CSV file.

    Args:
        filename: Path to the listings CSV file (assumed to be UTF-8). It
            must provide ``neighbourhood``, ``room_type`` and ``price``
            columns; any other column is ignored.

    Returns:
        A list with one dict per CSV row, holding the keys
        ``'neighbourhood'`` (str), ``'room_type'`` (str) and ``'price'``
        (int).

    Raises:
        KeyError: If a required column is missing.
        ValueError: If a price cannot be parsed as an integer.
    """
    # Decode explicitly as UTF-8: neighbourhood names are later compared
    # with user input, so they must not depend on the platform's locale
    # encoding.
    with open(filename, newline='', encoding='utf-8') as file:
        return [
            {
                'neighbourhood': row['neighbourhood'],
                'room_type': row['room_type'],
                'price': int(row['price']),
            }
            for row in csv.DictReader(file)
        ]

def count_types_by_neighborhood(data):
    """Count listings per room type within each neighbourhood.

    Args:
        data: Iterable of dicts that each provide ``'neighbourhood'`` and
            ``'room_type'`` keys.

    Returns:
        A plain dict mapping each neighbourhood to a plain dict of
        ``{room_type: number_of_listings}``.
    """
    counts = defaultdict(lambda: defaultdict(int))
    for item in data:
        counts[item['neighbourhood']][item['room_type']] += 1

    # Convert both levels to plain dicts. Leaving the inner defaultdicts in
    # place would clutter the printed output and would silently insert a
    # zero entry whenever a caller looks up a missing room type.
    return {neighbourhood: dict(by_type) for neighbourhood, by_type in counts.items()}

def average_price_by_neighborhood_and_type(data):
    """Average the listing price per room type inside each neighbourhood.

    Args:
        data: Iterable of dicts that each provide ``'neighbourhood'``,
            ``'room_type'`` and ``'price'`` keys.

    Returns:
        A dict mapping each neighbourhood to a dict of
        ``{room_type: average_price}``.
    """
    # Each leaf is a two-slot accumulator: [sum of prices, number of listings].
    running = defaultdict(lambda: defaultdict(lambda: [0, 0]))
    for record in data:
        hood = record['neighbourhood']
        kind = record['room_type']
        amount = record['price']
        accumulator = running[hood][kind]
        accumulator[0] += amount
        accumulator[1] += 1

    result = {}
    for hood, per_type in running.items():
        hood_averages = {}
        for kind, (price_sum, listing_count) in per_type.items():
            if listing_count:
                hood_averages[kind] = price_sum / listing_count
            else:
                hood_averages[kind] = 0
        result[hood] = hood_averages
    return result

def get_urca_prices(data):
    """Collect the prices of every listing in the 'Urca' neighbourhood.

    Args:
        data: Iterable of dicts that each provide ``'neighbourhood'`` and
            ``'price'`` keys.

    Returns:
        A list of the matching prices, in input order.
    """
    urca_prices = []
    for listing in data:
        if listing['neighbourhood'] == 'Urca':
            urca_prices.append(listing['price'])
    return urca_prices

def plot_room_types(data, neighborhood):
    """Plot the number of listings per room type for one neighbourhood.

    Args:
        data: Listing records as accepted by ``count_types_by_neighborhood``.
        neighborhood: Name of the neighbourhood to plot.

    When the neighbourhood does not occur in *data*, a message is printed
    and nothing is plotted. Otherwise the chart is saved as
    ``room_types.png`` and then shown on screen.
    """
    type_counts = count_types_by_neighborhood(data).get(neighborhood)
    if type_counts is None:
        print(f"Neighborhood '{neighborhood}' not found.")
        return

    plt.figure(figsize=(15, 6))
    labels, heights = type_counts.keys(), type_counts.values()
    plt.bar(labels, heights)
    plt.xticks(rotation=90)

    axis_font_size = 15
    plt.xlabel("Room Types", fontsize=axis_font_size)
    plt.ylabel('Number of Listings', fontsize=axis_font_size)
    # The chart title is simply the neighbourhood name.
    plt.title(neighborhood, fontsize=20)

    plt.tight_layout()
    plt.savefig('room_types.png')
    plt.show()

def main():
    """Run the per-neighbourhood analysis on ``listings.csv``.

    Prints the listing counts and average prices per neighbourhood and room
    type, prints the prices found in Urca, then asks the user for a
    neighbourhood and plots its room-type distribution.
    """
    data = load_data("listings.csv")

    counts = count_types_by_neighborhood(data)
    print("Counts by neighborhood and room type:")
    print(counts)

    averages = average_price_by_neighborhood_and_type(data)
    # The leading blank line is written as a "\n" escape: a raw line break
    # inside a single-quoted string literal is a SyntaxError.
    print("\nAverage prices by neighborhood and room type:")
    print(averages)

    urca_prices = get_urca_prices(data)
    print("\nPrices in Urca neighborhood:", urca_prices)

    sns.set()
    neighborhood = input("Choose a neighborhood to plot: ")
    plot_room_types(data, neighborhood)

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Number of listings per neighborhood:

The function count_listings_by_neighborhood() (called cantidad_anuncios_barrio() in the original version) calculates the total number of listings for each neighborhood and returns a dictionary with neighborhoods as keys and the corresponding number of listings as values.

The third analysis is handled by the function count_listings_by_neighborhood() (cantidad_anuncios_barrio() in the original version), which returns a dictionary (originally also named cantidad_anuncios_barrio). It maps each neighborhood to the total number of listings in that neighborhood.

import csv
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

def count_listings_by_neighborhood(filename):
    """Count how many listings each neighbourhood has.

    Args:
        filename: Path to the listings CSV file (assumed to be UTF-8). It
            must provide a ``neighbourhood`` column.

    Returns:
        A plain dict mapping each neighbourhood name to its number of
        listings, ordered by first appearance in the file.

    Raises:
        KeyError: If a row lacks the ``neighbourhood`` column.
    """
    # Decode explicitly as UTF-8: neighbourhood names become the dict keys,
    # so they must not depend on the platform's locale encoding.
    with open(filename, newline='', encoding='utf-8') as file:
        neighborhood_counts = Counter(
            row['neighbourhood'] for row in csv.DictReader(file)
        )

    return dict(neighborhood_counts)

def plot_listings_by_neighborhood(counts, start_range, end_range, exclude_neighborhood=None):
    """Plot listing counts for a slice of the neighbourhoods.

    Args:
        counts: Mapping of neighbourhood name to number of listings.
        start_range: Start index of the slice of neighbourhoods to show.
        end_range: End index (exclusive) of that slice.
        exclude_neighborhood: Optional neighbourhood name to leave out
            before the slice is taken; it is also mentioned in the title.

    The chart is saved as ``listings_by_neighborhood.png`` and then shown
    on screen.
    """
    # Drop the excluded neighbourhood (if any) first, then slice what is left.
    visible = [
        (name, total)
        for name, total in counts.items()
        if not exclude_neighborhood or name != exclude_neighborhood
    ][start_range:end_range]
    neighborhoods = [name for name, _ in visible]
    listing_counts = [total for _, total in visible]

    sns.set(style="whitegrid")
    plt.figure(figsize=(30, 6))
    plt.bar(neighborhoods, listing_counts)
    plt.xticks(rotation=90, fontsize=10)
    plt.yticks(fontsize=10)
    plt.xlabel('Neighborhood', fontsize=15)
    plt.ylabel('Number of Listings', fontsize=15)

    title = 'Number of Listings by Neighborhood'
    if exclude_neighborhood:
        title = title + f' (Excluding {exclude_neighborhood})'
    plt.title(title, fontsize=20)

    # Fixed y-axis ceiling chosen from the start index; the first threshold
    # that is exceeded wins, and starts of 0 or 1 keep automatic scaling.
    for threshold, ceiling in ((30, 500), (10, 1500), (1, 3000)):
        if start_range > threshold:
            plt.ylim(0, ceiling)
            break

    plt.tight_layout()
    plt.savefig('listings_by_neighborhood.png', bbox_inches='tight')
    plt.show()

def main():
    """Print listing counts per neighbourhood and plot a user-chosen range.

    Reads ``listings.csv``, asks for a start and end index, then draws the
    chart twice: once with every neighbourhood in the range and once with
    Copacabana left out.
    """
    neighborhood_counts = count_listings_by_neighborhood("listings.csv")

    print("Number of listings by neighborhood:")
    for name in neighborhood_counts:
        print(f"{name}: {neighborhood_counts[name]}")

    start_range = int(input("Enter the starting index for the range of neighborhoods to display (0-153): "))
    end_range = int(input("Enter the ending index for the range of neighborhoods to display (0-153): "))

    # First pass shows everything in the range; second pass omits Copacabana.
    for excluded in (None, "Copacabana"):
        plot_listings_by_neighborhood(
            neighborhood_counts,
            start_range,
            end_range,
            exclude_neighborhood=excluded,
        )

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

The dataset used for this analysis can be found here. I have improved and optimized the code, but the original file is still available here.