Cheatography
https://cheatography.com
Basic code to explore/analyse datasets
This is a draft cheat sheet. It is a work in progress and is not finished yet.
Explore and pre-process Data
Exploring Data:
# Get basic information about the DataFrame
df.info()
# Summary statistics for numerical columns
df.describe()
# Number of unique values in each column
df.nunique()
# Count missing values in each column
df.isnull().sum()
# Remove duplicate rows
df = df.drop_duplicates()
# Drop columns with missing values
df = df.dropna(axis=1)
# Fill missing values with a specific value
df['column_name'].fillna(value, inplace=True)
# Drop rows with missing values
df = df.dropna()
# Replace values in a column
df['column_name'].replace({old_value: new_value}, inplace=True)
# Convert data types
df['column_name'] = df['column_name'].astype('new_data_type')
# Rename columns
df.rename(columns={'old_column_name': 'new_column_name'}, inplace=True)
# Filter rows based on a condition
filtered_df = df[df['column_name'] > value]
# Multiple conditions
filtered_df = df[(df['column1'] > value1) & (df['column2'] < value2)]
# Select specific columns
selected_columns_df = df[['column1', 'column2']]
# Sorting the DataFrame
df.sort_values(by='column_name', ascending=False, inplace=True)
# Create a new column based on existing columns
df['new_column'] = df['column1'] + df['column2']
# Apply a function to a column
df['new_column'] = df['existing_column'].apply(lambda x: your_function(x))
|
|
Analyse
# Plotting histograms |
# Box plot |
# Scatter plot |
# line graph |
df['column_name'].hist() |
sns.boxplot(x='column1', y='column2', data=df) |
plt.scatter(df['column1'], df['column2']) plt.xlabel('Column1') plt.ylabel('Column2') plt.title('Scatter Plot') plt.show() |
plt.plot(df['x'], df['y']) plt.title('Sample Line Graph') plt.xlabel('X-axis label') plt.ylabel('Y-axis label') plt.show() |
|
First
# import all needed libraries |
# Load data from a CSV file |
# Display the first few rows of the DataFrame |
# Get basic information about the DataFrame |
# Summary statistics for numerical columns |
import pandas as pd import matplotlib.pyplot as plt import seaborn as sns |
df = pd.read_csv('your_file.csv') |
df.head() |
df.info() |
df.describe() |
|
|
|
|
|