Introduction to Pandas
Pandas is a powerful Python library for data manipulation and analysis. It provides data structures for efficiently storing and manipulating large datasets, as well as tools for reading and writing data in different formats. In this lesson, we'll explore the basics of Pandas and how to use it for data analysis.
What is Pandas?
Pandas is built on top of NumPy and provides two primary data structures:
- Series: A one-dimensional labeled array capable of holding any data type
- DataFrame: A two-dimensional labeled data structure with columns of potentially different types
Pandas is particularly well-suited for:
- Working with tabular data (like spreadsheets or SQL tables)
- Time series data
- Any labeled data that needs to be manipulated
- Statistical data analysis
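For a quick taste of the time series support, here is a minimal sketch with made-up daily temperature values indexed by date:
import pandas as pd
# A week of hypothetical daily temperatures, indexed by date
dates = pd.date_range('2024-01-01', periods=7, freq='D')
temps = pd.Series([21.5, 22.0, 19.8, 20.1, 23.4, 24.0, 22.7], index=dates)
print(temps.resample('2D').mean()) # average over two-day windows
# 2024-01-01 21.75
# 2024-01-03 19.95
# 2024-01-05 23.70
# 2024-01-07 22.70
# Freq: 2D, dtype: float64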
Installing Pandas
You can install Pandas using pip:
pip install pandas
Or using conda:
conda install pandas
Importing Pandas
The conventional way to import Pandas is:
import pandas as pd
Series
A Series is a one-dimensional labeled array capable of holding any data type. It's similar to a column in a spreadsheet or a single variable in a dataset.
Creating a Series
import pandas as pd
import numpy as np
# From a list
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)
# 0 1.0
# 1 3.0
# 2 5.0
# 3 NaN
# 4 6.0
# 5 8.0
# dtype: float64
# With custom index
s = pd.Series([1, 3, 5, 7], index=['a', 'b', 'c', 'd'])
print(s)
# a 1
# b 3
# c 5
# d 7
# dtype: int64
# From a dictionary
d = {'a': 1, 'b': 2, 'c': 3}
s = pd.Series(d)
print(s)
# a 1
# b 2
# c 3
# dtype: int64
Accessing Series Elements
s = pd.Series([1, 3, 5, 7], index=['a', 'b', 'c', 'd'])
# By index label
print(s['a']) # 1
print(s[['a', 'c']])
# a 1
# c 5
# dtype: int64
# By position (use .iloc; s[0] with a custom label index is deprecated in modern pandas)
print(s.iloc[0]) # 1
print(s.iloc[1:3])
# b 3
# c 5
# dtype: int64
# Boolean indexing
print(s[s > 3])
# c 5
# d 7
# dtype: int64
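Arithmetic between Series aligns values by index label, producing NaN wherever a label is missing on either side. A small sketch using the Series s from above plus a second, partly overlapping Series:
s2 = pd.Series([10, 20, 30], index=['a', 'b', 'e'])
print(s + s2) # aligned on the union of the two indexes
# a 11.0
# b 23.0
# c NaN
# d NaN
# e NaN
# dtype: float64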
DataFrame
A DataFrame is a two-dimensional labeled data structure with columns of potentially different types. It's similar to a spreadsheet, SQL table, or a dictionary of Series objects.
Creating a DataFrame
import pandas as pd
import numpy as np
# From a dictionary of lists
data = {
'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [25, 30, 35, 40],
'City': ['New York', 'London', 'Paris', 'Tokyo']
}
df = pd.DataFrame(data)
print(df)
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 London
# 2 Charlie 35 Paris
# 3 David 40 Tokyo
# From a list of dictionaries
data = [
{'Name': 'Alice', 'Age': 25, 'City': 'New York'},
{'Name': 'Bob', 'Age': 30, 'City': 'London'},
{'Name': 'Charlie', 'Age': 35, 'City': 'Paris'}
]
df = pd.DataFrame(data)
print(df)
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 London
# 2 Charlie 35 Paris
# From a NumPy array (random values, so your numbers will differ)
data = np.random.randn(3, 3)
df = pd.DataFrame(data, columns=['A', 'B', 'C'], index=['X', 'Y', 'Z'])
print(df)
# A B C
# X -0.264438 -0.363730 0.292977
# Y 0.144044 1.454274 0.761038
# Z 0.121675 0.443863 0.333674
Accessing DataFrame Elements
data = {
'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [25, 30, 35, 40],
'City': ['New York', 'London', 'Paris', 'Tokyo']
}
df = pd.DataFrame(data)
# Accessing columns
print(df['Name'])
# 0 Alice
# 1 Bob
# 2 Charlie
# 3 David
# Name: Name, dtype: object
print(df[['Name', 'Age']])
# Name Age
# 0 Alice 25
# 1 Bob 30
# 2 Charlie 35
# 3 David 40
# Accessing rows
print(df.loc[0])
# Name Alice
# Age 25
# City New York
# Name: 0, dtype: object
print(df.loc[1:2]) # note: .loc slicing includes both endpoints
# Name Age City
# 1 Bob 30 London
# 2 Charlie 35 Paris
# Accessing specific cells
print(df.loc[0, 'Name']) # Alice
print(df.iloc[0, 0]) # Alice
# Boolean indexing
print(df[df['Age'] > 30])
# Name Age City
# 2 Charlie 35 Paris
# 3 David 40 Tokyo
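Assigning to a new column label adds a column. For example, a derived column (a sketch that assumes the current year is 2024):
df['Birth Year'] = 2024 - df['Age'] # vectorized: computed for every row
print(df['Birth Year'])
# 0 1999
# 1 1994
# 2 1989
# 3 1984
# Name: Birth Year, dtype: int64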
Reading and Writing Data
Pandas provides functions to read and write data in various formats.
Reading Data
# Reading CSV
df = pd.read_csv('data.csv')
# Reading Excel (requires an engine such as openpyxl: pip install openpyxl)
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# Reading JSON
df = pd.read_json('data.json')
# Reading SQL
import sqlite3
conn = sqlite3.connect('database.db')
df = pd.read_sql('SELECT * FROM table_name', conn)
Writing Data
# Writing to CSV
df.to_csv('output.csv', index=False)
# Writing to Excel
df.to_excel('output.xlsx', sheet_name='Sheet1', index=False)
# Writing to JSON
df.to_json('output.json')
# Writing to SQL
import sqlite3
conn = sqlite3.connect('database.db')
df.to_sql('table_name', conn, if_exists='replace', index=False)
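The file paths above are placeholders. To try the round trip without touching the filesystem, you can write to and read from an in-memory text buffer (a minimal sketch):
import io
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})
buffer = io.StringIO()
df.to_csv(buffer, index=False) # write CSV text into the buffer
buffer.seek(0) # rewind so read_csv starts at the beginning
df2 = pd.read_csv(buffer)
print(df.equals(df2)) # True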
Data Manipulation
Pandas provides many functions for manipulating data.
Basic Information
data = {
'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [25, 30, 35, 40],
'City': ['New York', 'London', 'Paris', 'Tokyo']
}
df = pd.DataFrame(data)
# Basic information about the DataFrame
print(df.shape) # (4, 3) - (rows, columns)
print(df.dtypes)
# Name object
# Age int64
# City object
# dtype: object
df.info() # info() prints its report directly and returns None, so no print() needed
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 4 entries, 0 to 3
# Data columns (total 3 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Name 4 non-null object
# 1 Age 4 non-null int64
# 2 City 4 non-null object
# dtypes: int64(1), object(2)
# memory usage: 224.0+ bytes
print(df.describe())
# Age
# count 4.000000
# mean 32.500000
# std 6.454972
# min 25.000000
# 25% 28.750000
# 50% 32.500000
# 75% 36.250000
# max 40.000000
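For a quick look at the rows themselves, head() and tail() return the first and last n rows (5 by default):
print(df.head(2))
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 London
print(df.tail(1))
# Name Age City
# 3 David 40 Tokyo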
Handling Missing Data
import numpy as np
data = {
'Name': ['Alice', 'Bob', 'Charlie', np.nan],
'Age': [25, np.nan, 35, 40],
'City': ['New York', 'London', np.nan, 'Tokyo']
}
df = pd.DataFrame(data)
print(df)
# Name Age City
# 0 Alice 25.0 New York
# 1 Bob NaN London
# 2 Charlie 35.0 NaN
# 3 NaN 40.0 Tokyo
# Check for missing values
print(df.isnull())
# Name Age City
# 0 False False False
# 1 False True False
# 2 False False True
# 3 True False False
print(df.isnull().sum())
# Name 1
# Age 1
# City 1
# dtype: int64
# Drop rows with missing values
print(df.dropna())
# Name Age City
# 0 Alice 25.0 New York
# Drop columns with missing values
print(df.dropna(axis=1))
# Empty DataFrame
# Columns: []
# Index: [0, 1, 2, 3]
# Fill missing values
print(df.fillna({'Name': 'Unknown', 'Age': 0, 'City': 'Unknown'}))
# Name Age City
# 0 Alice 25.0 New York
# 1 Bob 0.0 London
# 2 Charlie 35.0 Unknown
# 3 Unknown 40.0 Tokyo
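Another common strategy is to fill a numeric column with a statistic computed from the column itself, such as its mean (a sketch using the same df):
df['Age'] = df['Age'].fillna(df['Age'].mean())
print(df['Age'])
# 0 25.000000
# 1 33.333333
# 2 35.000000
# 3 40.000000
# Name: Age, dtype: float64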
Filtering and Sorting
data = {
'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [25, 30, 35, 40],
'City': ['New York', 'London', 'Paris', 'Tokyo']
}
df = pd.DataFrame(data)
# Filtering
print(df[df['Age'] > 30])
# Name Age City
# 2 Charlie 35 Paris
# 3 David 40 Tokyo
print(df[(df['Age'] > 25) & (df['Age'] < 40)])
# Name Age City
# 1 Bob 30 London
# 2 Charlie 35 Paris
# Sorting
print(df.sort_values('Age'))
# Name Age City
# 0 Alice 25 New York
# 1 Bob 30 London
# 2 Charlie 35 Paris
# 3 David 40 Tokyo
print(df.sort_values('Name', ascending=False))
# Name Age City
# 3 David 40 Tokyo
# 2 Charlie 35 Paris
# 1 Bob 30 London
# 0 Alice 25 New York
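You can also sort by several columns at once, giving each its own direction. In this df the City values are all distinct, so the Age direction is only illustrative:
print(df.sort_values(['City', 'Age'], ascending=[True, False]))
# Name Age City
# 1 Bob 30 London
# 0 Alice 25 New York
# 2 Charlie 35 Paris
# 3 David 40 Tokyo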
Grouping and Aggregation
data = {
'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Alice', 'Bob'],
'Department': ['HR', 'IT', 'Finance', 'HR', 'Finance', 'IT'],
'Salary': [50000, 60000, 70000, 55000, 65000, 75000]
}
df = pd.DataFrame(data)
print(df)
# Name Department Salary
# 0 Alice HR 50000
# 1 Bob IT 60000
# 2 Charlie Finance 70000
# 3 David HR 55000
# 4 Alice Finance 65000
# 5 Bob IT 75000
# Group by one column (select the Salary column before aggregating;
# in pandas 2+, calling .mean() on non-numeric columns raises an error)
grouped = df.groupby('Department')
print(grouped['Salary'].mean())
# Department
# Finance 67500.0
# HR 52500.0
# IT 67500.0
# Name: Salary, dtype: float64
# Group by multiple columns
grouped = df.groupby(['Department', 'Name'])
print(grouped.mean())
# Salary
# Department Name
# Finance Alice 65000.0
# Charlie 70000.0
# HR Alice 50000.0
# David 55000.0
# IT Bob 67500.0
# Aggregation
print(df.groupby('Department').agg({
'Salary': ['min', 'max', 'mean', 'sum']
}))
# Salary
# min max mean sum
# Department
# Finance 65000 70000 67500.0 135000
# HR 50000 55000 52500.0 105000
# IT 60000 75000 67500.0 135000
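Named aggregation (available since pandas 0.25) produces flat, descriptive column names instead of the nested header above:
print(df.groupby('Department').agg(
    min_salary=('Salary', 'min'),
    avg_salary=('Salary', 'mean')
))
# min_salary avg_salary
# Department
# Finance 65000 67500.0
# HR 50000 52500.0
# IT 60000 67500.0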
Merging and Joining
# Create two DataFrames
df1 = pd.DataFrame({
'ID': [1, 2, 3, 4],
'Name': ['Alice', 'Bob', 'Charlie', 'David']
})
print(df1)
# ID Name
# 0 1 Alice
# 1 2 Bob
# 2 3 Charlie
# 3 4 David
df2 = pd.DataFrame({
'ID': [1, 2, 3, 5],
'Salary': [50000, 60000, 70000, 80000]
})
print(df2)
# ID Salary
# 0 1 50000
# 1 2 60000
# 2 3 70000
# 3 5 80000
# Inner join (default)
merged = pd.merge(df1, df2, on='ID')
print(merged)
# ID Name Salary
# 0 1 Alice 50000
# 1 2 Bob 60000
# 2 3 Charlie 70000
# Left join
merged = pd.merge(df1, df2, on='ID', how='left')
print(merged)
# ID Name Salary
# 0 1 Alice 50000.0
# 1 2 Bob 60000.0
# 2 3 Charlie 70000.0
# 3 4 David NaN
# Right join
merged = pd.merge(df1, df2, on='ID', how='right')
print(merged)
# ID Name Salary
# 0 1 Alice 50000
# 1 2 Bob 60000
# 2 3 Charlie 70000
# 3 5 NaN 80000
# Outer join
merged = pd.merge(df1, df2, on='ID', how='outer')
print(merged)
# ID Name Salary
# 0 1 Alice 50000.0
# 1 2 Bob 60000.0
# 2 3 Charlie 70000.0
# 3 4 David NaN
# 4 5 NaN 80000.0
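merge joins on key columns; to simply stack DataFrames vertically, pd.concat is the usual tool (df3 here is a new, made-up frame with the same columns as df1):
df3 = pd.DataFrame({
    'ID': [6, 7],
    'Name': ['Eve', 'Frank']
})
stacked = pd.concat([df1, df3], ignore_index=True) # renumber the index 0..n-1
print(stacked)
# ID Name
# 0 1 Alice
# 1 2 Bob
# 2 3 Charlie
# 3 4 David
# 4 6 Eve
# 5 7 Frank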
Data Visualization with Pandas
Pandas integrates with Matplotlib to provide basic plotting capabilities.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Create a DataFrame with random data
df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
# Line plot
df.plot()
plt.title('Line Plot')
plt.xlabel('Index')
plt.ylabel('Value')
plt.show()
# Bar plot
df.iloc[5].plot(kind='bar')
plt.title('Bar Plot')
plt.xlabel('Column')
plt.ylabel('Value')
plt.show()
# Histogram
df['A'].plot(kind='hist', bins=20)
plt.title('Histogram')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()
# Scatter plot
df.plot.scatter(x='A', y='B')
plt.title('Scatter Plot')
plt.xlabel('A')
plt.ylabel('B')
plt.show()
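To see each column in its own panel rather than overlaid on one axis, pandas can also draw subplots (a small sketch; the layout and figsize values are just illustrative choices):
# One subplot per column, arranged in a 2x2 grid
df.plot(subplots=True, layout=(2, 2), figsize=(8, 6))
plt.tight_layout()
plt.show()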
Try experimenting with these examples in your own Python environment!