# To start understanding our data, we're using the pandas library in Python.
import pandas as pd  # This library helps us work with and analyze data.

# Now, let's read our dataset and store it in a variable called 'df'.
df = pd.read_csv('path/to/your/dataset.csv')  # Make sure to replace 'path/to/your/dataset.csv' with the actual path to your dataset.

# 1. How big is the data?
# We want to know the number of rows and columns in our dataset.
print("1. Size of the data:", df.shape)
# This line of code prints something like (rows, columns), giving us an idea of how much data we have.

# 2. What does the data look like?
# We randomly pick 5 rows to get a sneak peek at what our data looks like.
print("\n2. Sample of the data:")
print(df.sample(5))
# This shows a small part of our dataset, helping us understand the structure and content.

# 3. What are the data types of the columns?
# We check the types of data in each column, like whether it's a number, text, or something else.
print("\n3. Data types of columns:")
df.info()  # info() prints its summary directly; wrapping it in print() would just add a stray 'None' at the end.
# This gives us insights into how our data is stored, which is crucial for analysis and cleaning.
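# If you only want the column types, without the non-null counts that info() also reports,
# df.dtypes returns them as a plain Series:
print(df.dtypes)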

# 4. Are there any missing values?
# We find out if there are any blanks or missing data points in our dataset.
print("\n4. Missing values:")
print(df.isnull().sum())
# This helps us identify areas where information might be incomplete.
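# Once gaps are identified, a common follow-up is to fill or drop them. The lines below
# are only a sketch: they assume an 'Age' column (as in the Titanic dataset) and work on
# a copy, so the original DataFrame used above stays untouched.
df_filled = df.copy()
df_filled['Age'] = df_filled['Age'].fillna(df_filled['Age'].median())  # fill missing ages with the median age
print("Missing 'Age' values after filling:", df_filled['Age'].isnull().sum())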

# 5. How does the data look mathematically?
# We get statistical information like averages, minimums, and maximums for our numerical columns.
print("\n5. Mathematical summary of the data:")
print(df.describe())
# This gives us a numerical summary of our dataset, helping us understand the distribution of values.
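# describe() only covers numeric columns by default. Passing include='object' gives a
# comparable summary (count, unique, top, freq) for the text columns as well:
print("\nSummary of text (object) columns:")
print(df.describe(include='object'))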

# 6. Are there duplicate values?
# We check if there are any identical rows in our dataset.
print("\n6. Duplicate values:")
print("Number of duplicates:", df.duplicated().sum())
# Identifying and handling duplicates is important for accurate analysis.
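# If duplicates do turn up, removing them is a single call. A minimal sketch, working on
# a copy so the exploration above is not affected:
df_deduped = df.drop_duplicates()
print("Rows before/after dropping duplicates:", len(df), "/", len(df_deduped))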

# 7. How are the columns correlated?
# We explore how different columns are related, especially with respect to the 'Survived' column.
print("\n7. Correlation between columns:")
print("Correlation with 'Survived' column:")
print(df.corr(numeric_only=True)['Survived'])  # numeric_only=True skips text columns, which recent pandas versions no longer drop silently.
# This tells us the relationship between each column and whether a passenger survived in the Titanic dataset.

# Note: In the Titanic dataset, the 'Survived' column indicates whether a passenger survived (1) or not (0) during the tragic event.
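# A quick sanity check of that target column (a small sketch, assuming the Titanic data):
print("\nSurvival counts:")
print(df['Survived'].value_counts())
print("Overall survival rate:", round(df['Survived'].mean(), 3))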

Understanding these steps helps us get a good grasp of our data, making it easier to make informed decisions in data analysis or machine learning tasks.

# 8. What is the distribution of categorical variables?
# For non-numeric columns, understanding the distribution can be valuable.
categorical_columns = df.select_dtypes(include=['object']).columns
print("\n8. Distribution of categorical variables:")
for col in categorical_columns:
    print(f"\nDistribution for {col}:")
    print(df[col].value_counts())
# This part explores the distribution of non-numeric (categorical) columns, providing insights into how often different categories appear.
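# The same loop with value_counts(normalize=True) reports proportions instead of raw
# counts, which is often easier to compare across columns (just a small variation on
# the loop above):
for col in categorical_columns:
    print(f"\nProportions for {col}:")
    print(df[col].value_counts(normalize=True))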

# 9. What are the unique values in each column?
# Knowing the unique values helps in spotting anomalies and understanding the diversity of data.
print("\n9. Unique values in each column:")
for col in df.columns:
    print(f"\nUnique values for {col}:")
    print(df[col].unique())
# Printing unique values helps us understand the diversity of data in each column, revealing potential issues or patterns.
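# Listing every unique value can be overwhelming for high-cardinality columns such as
# names or ticket numbers, so a compact count of unique values per column is also handy:
print("\nNumber of unique values per column:")
print(df.nunique())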

# 10. How to visualize the data?
# Visualization is a powerful tool. Let's create a simple histogram for the 'Age' column.
import matplotlib.pyplot as plt
import seaborn as sns

# Set up the visualization environment
sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))

# Plot a histogram for the 'Age' column
sns.histplot(df['Age'], bins=20, kde=True, color='skyblue')
plt.title('Distribution of Ages')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()
# Visualization is crucial. Here, we use a histogram to visualize the distribution of ages in the dataset.

# 11. Exploring relationships between variables:
# We can use scatter plots to visualize relationships between numerical variables.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Age', y='Fare', data=df, hue=df['Survived'].map({0: 'Not Survived', 1: 'Survived'}), palette='viridis')  # mapping 0/1 to readable labels keeps the legend tied to the right points
plt.title('Scatter Plot: Age vs. Fare with Survival Information')
plt.xlabel('Age')
plt.ylabel('Fare')
plt.legend(title='Survived', loc='upper right')  # overriding the labels by position here could mislabel the points, so we rely on the mapped hue values above
plt.show()
# Scatter plots help us visualize relationships between numerical variables. This one specifically looks at the relationship between age, fare, and survival status.
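# If you want many pairwise relationships at once, seaborn's pairplot extends the same
# idea. This is only an optional sketch (column names assumed from the Titanic dataset,
# and it can be slow on large data):
sns.pairplot(df[['Age', 'Fare', 'Survived']].dropna(), hue='Survived', palette='viridis')
plt.show()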

Understanding these aspects of your data allows for a more comprehensive exploration, helping to uncover patterns, anomalies, and potential features for analysis or modeling.

# 12. Visualizing the distribution of numerical variables:
# Box plots are useful for understanding the distribution and identifying outliers.
plt.figure(figsize=(12, 8))
sns.boxplot(data=df.select_dtypes(include=['int', 'float']))
plt.title('Box Plot: Distribution of Numerical Variables')
plt.show()
# Box plots are useful for understanding the distribution of numerical variables, providing insights into central tendencies and identifying outliers.

# 13. Grouping data for insights:
# We can group the data by a specific column to gain insights into patterns.
grouped_by_class = df.groupby('Pclass').mean(numeric_only=True)  # numeric_only=True avoids errors from text columns such as 'Name'
print("\n13. Grouping data for insights (mean values by passenger class):")
print(grouped_by_class)
# Grouping data allows us to calculate aggregated values. In this case, we calculate the mean values for each numerical column based on passenger class.
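# Grouping also answers targeted questions directly. As a small sketch (column names
# assumed from the Titanic dataset), here is the survival rate per passenger class:
print("\nSurvival rate by passenger class:")
print(df.groupby('Pclass')['Survived'].mean())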

# 14. Exploring survival rate by different categories:
# Visualizing survival rates by different categories helps identify patterns.
plt.figure(figsize=(10, 6))
sns.barplot(x='Sex', y='Survived', data=df, palette='muted')
plt.title('Survival Rate by Gender')
plt.xlabel('Gender')
plt.ylabel('Survival Rate')
plt.show()
# Bar plots are used to visualize survival rates based on different categories. In this example, we explore the survival rate by gender.

# 15. Correlation Heatmap:
# A heatmap visually represents the correlation matrix, making it easier to identify strong correlations.
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm', linewidths=.5)
plt.title('Correlation Heatmap')
plt.show()
# A heatmap provides a visual representation of the correlation matrix, making it easier to identify strong correlations between numerical variables.

These additional techniques contribute to a more comprehensive understanding of your data, aiding in the identification of patterns, trends, and potential areas for further investigation or feature engineering.
