Introduction to Pandas

Pandas is a powerful Python library for data manipulation and analysis. It provides data structures for efficiently storing and manipulating large datasets, as well as tools for reading and writing data in different formats. In this lesson, we'll explore the basics of Pandas and how to use it for data analysis.

What is Pandas?

Pandas is built on top of NumPy and provides two primary data structures:

  • Series: A one-dimensional labeled array capable of holding any data type
  • DataFrame: A two-dimensional labeled data structure with columns of potentially different types

Pandas is particularly well-suited for:

  • Working with tabular data (like spreadsheets or SQL tables)
  • Time series data
  • Any labeled data that needs to be manipulated
  • Statistical data analysis

Installing Pandas

You can install Pandas using pip:

pip install pandas

Or using conda:

conda install pandas

Importing Pandas

The conventional way to import Pandas is:

import pandas as pd
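
If you want to confirm which version is installed (a few behaviors shown in this lesson, such as positional indexing on a Series, changed across releases), print the version string:

import pandas as pd

# The exact output depends on your installation, e.g. 2.2.3
print(pd.__version__)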

Series

A Series is a one-dimensional labeled array capable of holding any data type. It's similar to a column in a spreadsheet or a single variable in a dataset.

Creating a Series

import pandas as pd
import numpy as np

# From a list
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)
# 0    1.0
# 1    3.0
# 2    5.0
# 3    NaN
# 4    6.0
# 5    8.0
# dtype: float64

# With custom index
s = pd.Series([1, 3, 5, 7], index=['a', 'b', 'c', 'd'])
print(s)
# a    1
# b    3
# c    5
# d    7
# dtype: int64

# From a dictionary
d = {'a': 1, 'b': 2, 'c': 3}
s = pd.Series(d)
print(s)
# a    1
# b    2
# c    3
# dtype: int64
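
A dictionary can also be combined with an explicit index. Values are matched up by label, and any label that is missing from the dictionary comes through as NaN. A minimal sketch reusing the dictionary d from above:

# 'd' is not a key in the dictionary, so it becomes NaN
# (the dtype is upcast to float64 to hold it)
s = pd.Series(d, index=['b', 'c', 'd'])
print(s)
# b    2.0
# c    3.0
# d    NaN
# dtype: float64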

Accessing Series Elements

s = pd.Series([1, 3, 5, 7], index=['a', 'b', 'c', 'd'])

# By index label
print(s['a'])  # 1
print(s[['a', 'c']])
# a    1
# c    5
# dtype: int64

# By position (use .iloc for position-based access)
print(s.iloc[0])  # 1
print(s.iloc[1:3])
# b    3
# c    5
# dtype: int64

# Boolean indexing
print(s[s > 3])
# c    5
# d    7
# dtype: int64
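
Series also support vectorized arithmetic and reductions, so you rarely need to loop over elements. A short sketch:

s = pd.Series([1, 3, 5, 7], index=['a', 'b', 'c', 'd'])

# Element-wise arithmetic keeps the labels
print(s * 2)
# a     2
# b     6
# c    10
# d    14
# dtype: int64

# Reductions collapse the Series to a single value
print(s.mean())  # 4.0
print(s.max())   # 7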

DataFrame

A DataFrame is a two-dimensional labeled data structure with columns of potentially different types. It's similar to a spreadsheet, SQL table, or a dictionary of Series objects.

Creating a DataFrame

import pandas as pd
import numpy as np

# From a dictionary of lists
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['New York', 'London', 'Paris', 'Tokyo']
}
df = pd.DataFrame(data)
print(df)
#       Name  Age      City
# 0    Alice   25  New York
# 1      Bob   30    London
# 2  Charlie   35     Paris
# 3    David   40     Tokyo

# From a list of dictionaries
data = [
    {'Name': 'Alice', 'Age': 25, 'City': 'New York'},
    {'Name': 'Bob', 'Age': 30, 'City': 'London'},
    {'Name': 'Charlie', 'Age': 35, 'City': 'Paris'}
]
df = pd.DataFrame(data)
print(df)
#       Name  Age      City
# 0    Alice   25  New York
# 1      Bob   30    London
# 2  Charlie   35     Paris

# From a NumPy array
data = np.random.randn(3, 3)
df = pd.DataFrame(data, columns=['A', 'B', 'C'], index=['X', 'Y', 'Z'])
print(df)
#           A         B         C
# X -0.264438 -0.363730  0.292977
# Y  0.144044  1.454274  0.761038
# Z  0.121675  0.443863  0.333674
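
Because a DataFrame behaves like a dictionary of Series objects, you can also build one directly from Series. The rows are aligned by index, and any missing combination becomes NaN. A minimal sketch:

# From a dictionary of Series (indexes are aligned automatically)
data = {
    'A': pd.Series([1, 2], index=['X', 'Y']),
    'B': pd.Series([3, 4, 5], index=['X', 'Y', 'Z'])
}
df = pd.DataFrame(data)
print(df)
#      A  B
# X  1.0  3
# Y  2.0  4
# Z  NaN  5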

Accessing DataFrame Elements

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['New York', 'London', 'Paris', 'Tokyo']
}
df = pd.DataFrame(data)

# Accessing columns
print(df['Name'])
# 0      Alice
# 1        Bob
# 2    Charlie
# 3      David
# Name: Name, dtype: object

print(df[['Name', 'Age']])
#       Name  Age
# 0    Alice   25
# 1      Bob   30
# 2  Charlie   35
# 3    David   40

# Accessing rows
print(df.loc[0])
# Name       Alice
# Age           25
# City    New York
# Name: 0, dtype: object

print(df.loc[1:2])
#       Name  Age    City
# 1      Bob   30  London
# 2  Charlie   35   Paris

# Accessing specific cells
print(df.loc[0, 'Name'])  # Alice
print(df.iloc[0, 0])      # Alice

# Boolean indexing
print(df[df['Age'] > 30])
#       Name  Age   City
# 2  Charlie   35  Paris
# 3    David   40  Tokyo
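
For purely positional access you can slice rows and columns with .iloc, and .at (label-based) and .iat (position-based) provide fast scalar lookups. A brief sketch using the same df:

# Positional slicing: first two rows, first two columns
print(df.iloc[0:2, 0:2])
#     Name  Age
# 0  Alice   25
# 1    Bob   30

# Fast scalar access
print(df.at[0, 'Name'])  # Alice (by label)
print(df.iat[0, 1])      # 25 (by position)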

Reading and Writing Data

Pandas provides functions to read and write data in various formats.

Reading Data

# Reading CSV
df = pd.read_csv('data.csv')

# Reading Excel
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

# Reading JSON
df = pd.read_json('data.json')

# Reading SQL
import sqlite3
conn = sqlite3.connect('database.db')
df = pd.read_sql('SELECT * FROM table_name', conn)
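
read_csv in particular accepts many options for dealing with real-world files. The sketch below shows a few commonly used parameters; the file name and column names are placeholders and assume the file actually contains those columns:

# A few common read_csv options (file and column names are hypothetical)
df = pd.read_csv(
    'data.csv',
    sep=',',                  # field delimiter
    usecols=['Name', 'Age'],  # load only these columns
    na_values=['NA', ''],     # extra strings to treat as missing
    nrows=100                 # read only the first 100 rows
)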

Writing Data

# Writing to CSV
df.to_csv('output.csv', index=False)

# Writing to Excel
df.to_excel('output.xlsx', sheet_name='Sheet1', index=False)

# Writing to JSON
df.to_json('output.json')

# Writing to SQL
import sqlite3
conn = sqlite3.connect('database.db')
df.to_sql('table_name', conn, if_exists='replace', index=False)

Data Manipulation

Pandas provides many functions for manipulating data.

Basic Information

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['New York', 'London', 'Paris', 'Tokyo']
}
df = pd.DataFrame(data)

# Basic information about the DataFrame
print(df.shape)  # (4, 3) - (rows, columns)
print(df.dtypes)
# Name    object
# Age      int64
# City    object
# dtype: object

df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 4 entries, 0 to 3
# Data columns (total 3 columns):
#  #   Column  Non-Null Count  Dtype 
# ---  ------  --------------  ----- 
#  0   Name    4 non-null      object
#  1   Age     4 non-null      int64 
#  2   City    4 non-null      object
# dtypes: int64(1), object(2)
# memory usage: 224.0+ bytes

print(df.describe())
#              Age
# count   4.000000
# mean   32.500000
# std     6.454972
# min    25.000000
# 25%    28.750000
# 50%    32.500000
# 75%    36.250000
# max    40.000000
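
A few other quick inspection helpers are worth knowing:

# First and last rows
print(df.head(2))
#     Name  Age      City
# 0  Alice   25  New York
# 1    Bob   30    London

print(df.tail(1))
#     Name  Age   City
# 3  David   40  Tokyo

# Column and index labels
print(df.columns)  # Index(['Name', 'Age', 'City'], dtype='object')
print(df.index)    # RangeIndex(start=0, stop=4, step=1)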

Handling Missing Data

import numpy as np

data = {
    'Name': ['Alice', 'Bob', 'Charlie', np.nan],
    'Age': [25, np.nan, 35, 40],
    'City': ['New York', 'London', np.nan, 'Tokyo']
}
df = pd.DataFrame(data)
print(df)
#       Name   Age      City
# 0    Alice  25.0  New York
# 1      Bob   NaN    London
# 2  Charlie  35.0       NaN
# 3      NaN  40.0     Tokyo

# Check for missing values
print(df.isnull())
#     Name    Age   City
# 0  False  False  False
# 1  False   True  False
# 2  False  False   True
# 3   True  False  False

print(df.isnull().sum())
# Name    1
# Age     1
# City    1
# dtype: int64

# Drop rows with missing values
print(df.dropna())
#       Name   Age      City
# 0    Alice  25.0  New York

# Drop columns with missing values
print(df.dropna(axis=1))
# Empty DataFrame
# Columns: []
# Index: [0, 1, 2, 3]

# Fill missing values
print(df.fillna({'Name': 'Unknown', 'Age': 0, 'City': 'Unknown'}))
#       Name   Age      City
# 0    Alice  25.0  New York
# 1      Bob   0.0    London
# 2  Charlie  35.0   Unknown
# 3  Unknown  40.0     Tokyo
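
Instead of fixed values, a common pattern is to fill numeric gaps with a statistic of the column, such as its mean:

# Fill the missing Age with the mean of the non-missing ages:
# (25 + 35 + 40) / 3 ≈ 33.33
df_filled = df.copy()
df_filled['Age'] = df_filled['Age'].fillna(df_filled['Age'].mean())
print(df_filled['Age'])
# 0    25.000000
# 1    33.333333
# 2    35.000000
# 3    40.000000
# Name: Age, dtype: float64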

Filtering and Sorting

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['New York', 'London', 'Paris', 'Tokyo']
}
df = pd.DataFrame(data)

# Filtering
print(df[df['Age'] > 30])
#       Name  Age   City
# 2  Charlie   35  Paris
# 3    David   40  Tokyo

print(df[(df['Age'] > 25) & (df['Age'] < 40)])
#       Name  Age    City
# 1      Bob   30  London
# 2  Charlie   35   Paris

# Sorting
print(df.sort_values('Age'))
#       Name  Age      City
# 0    Alice   25  New York
# 1      Bob   30    London
# 2  Charlie   35     Paris
# 3    David   40     Tokyo

print(df.sort_values('Name', ascending=False))
#       Name  Age      City
# 3    David   40     Tokyo
# 2  Charlie   35     Paris
# 1      Bob   30    London
# 0    Alice   25  New York
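
Two more patterns that come up often are selecting rows whose value belongs to a set (.isin) and sorting by several columns at once:

# Keep only the rows whose City is in a given set
print(df[df['City'].isin(['Paris', 'Tokyo'])])
#       Name  Age   City
# 2  Charlie   35  Paris
# 3    David   40  Tokyo

# Sort by several columns, each with its own direction
print(df.sort_values(['City', 'Age'], ascending=[True, False]))
#       Name  Age      City
# 1      Bob   30    London
# 0    Alice   25  New York
# 2  Charlie   35     Paris
# 3    David   40     Tokyo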

Grouping and Aggregation

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Alice', 'Bob'],
    'Department': ['HR', 'IT', 'Finance', 'HR', 'Finance', 'IT'],
    'Salary': [50000, 60000, 70000, 55000, 65000, 75000]
}
df = pd.DataFrame(data)
print(df)
#       Name Department  Salary
# 0    Alice         HR   50000
# 1      Bob         IT   60000
# 2  Charlie    Finance   70000
# 3    David         HR   55000
# 4    Alice    Finance   65000
# 5      Bob         IT   75000

# Group by one column
grouped = df.groupby('Department')
print(grouped.mean(numeric_only=True))
#              Salary
# Department
# Finance     67500.0
# HR          52500.0
# IT          67500.0

# Group by multiple columns
grouped = df.groupby(['Department', 'Name'])
print(grouped.mean())
#                      Salary
# Department Name
# Finance    Alice    65000.0
#            Charlie  70000.0
# HR         Alice    50000.0
#            David    55000.0
# IT         Bob      67500.0

# Aggregation
print(df.groupby('Department').agg({
    'Salary': ['min', 'max', 'mean', 'sum']
}))
#            Salary
#               min    max     mean     sum
# Department
# Finance     65000  70000  67500.0  135000
# HR          50000  55000  52500.0  105000
# IT          60000  75000  67500.0  135000
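
Named aggregation gives the result columns readable names, and .size() counts the rows in each group:

# Named aggregation: each keyword becomes an output column
print(df.groupby('Department').agg(
    avg_salary=('Salary', 'mean'),
    headcount=('Name', 'count')
))
#             avg_salary  headcount
# Department
# Finance        67500.0          2
# HR             52500.0          2
# IT             67500.0          2

# Number of rows in each group
print(df.groupby('Department').size())
# Department
# Finance    2
# HR         2
# IT         2
# dtype: int64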

Merging and Joining

# Create two DataFrames
df1 = pd.DataFrame({
    'ID': [1, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Charlie', 'David']
})
print(df1)
#    ID     Name
# 0   1    Alice
# 1   2      Bob
# 2   3  Charlie
# 3   4    David

df2 = pd.DataFrame({
    'ID': [1, 2, 3, 5],
    'Salary': [50000, 60000, 70000, 80000]
})
print(df2)
#    ID  Salary
# 0   1   50000
# 1   2   60000
# 2   3   70000
# 3   5   80000

# Inner join (default)
merged = pd.merge(df1, df2, on='ID')
print(merged)
#    ID     Name  Salary
# 0   1    Alice   50000
# 1   2      Bob   60000
# 2   3  Charlie   70000

# Left join
merged = pd.merge(df1, df2, on='ID', how='left')
print(merged)
#    ID     Name  Salary
# 0   1    Alice   50000
# 1   2      Bob   60000
# 2   3  Charlie   70000
# 3   4    David     NaN

# Right join
merged = pd.merge(df1, df2, on='ID', how='right')
print(merged)
#    ID     Name  Salary
# 0   1    Alice   50000
# 1   2      Bob   60000
# 2   3  Charlie   70000
# 3   5      NaN   80000

# Outer join
merged = pd.merge(df1, df2, on='ID', how='outer')
print(merged)
#    ID     Name  Salary
# 0   1    Alice   50000
# 1   2      Bob   60000
# 2   3  Charlie   70000
# 3   4    David     NaN
# 4   5      NaN   80000
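
To stack DataFrames on top of each other (rather than joining on a key), use pd.concat. A small sketch with an extra frame defined here just for illustration:

# df3 is a hypothetical frame with the same columns as df1
df3 = pd.DataFrame({
    'ID': [5, 6],
    'Name': ['Eve', 'Frank']
})

# Stack the rows and renumber the index
combined = pd.concat([df1, df3], ignore_index=True)
print(combined)
#    ID     Name
# 0   1    Alice
# 1   2      Bob
# 2   3  Charlie
# 3   4    David
# 4   5      Eve
# 5   6    Frank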

Data Visualization with Pandas

Pandas integrates with Matplotlib to provide basic plotting capabilities.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Create a DataFrame with random data
df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])

# Line plot
df.plot()
plt.title('Line Plot')
plt.xlabel('Index')
plt.ylabel('Value')
plt.show()

# Bar plot
df.iloc[5].plot(kind='bar')
plt.title('Bar Plot')
plt.xlabel('Column')
plt.ylabel('Value')
plt.show()

# Histogram
df['A'].plot(kind='hist', bins=20)
plt.title('Histogram')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()

# Scatter plot
df.plot.scatter(x='A', y='B')
plt.title('Scatter Plot')
plt.xlabel('A')
plt.ylabel('B')
plt.show()
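
Any of these plots can also be written to a file with Matplotlib instead of being shown interactively; the file name below is just an example:

# Box plot of all four columns, saved to disk
ax = df.plot(kind='box')
ax.set_title('Box Plot')
plt.savefig('box_plot.png', dpi=150, bbox_inches='tight')
plt.close()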

Try experimenting with Pandas in the code playground below!

Quick Quiz

Which of the following is NOT a primary data structure in Pandas?

Code Playground
