import pandas as pd
df = pd.DataFrame({'Fruit': ['Apple', 'Banana', 'Apple', 'Orange']})
print(df['Fruit'].nunique())
3
#12.
df.isnull() — Returns a DataFrame of boolean values indicating missing values.
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan], 'B': [np.nan, 'x']})
print(df.isnull())
A B
0 False True
1 True False
#13.
df.isnull().sum() — Returns the number of missing values in each column.
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan, 3, np.nan], 'B': [5, 6, 7, 8]})
print(df.isnull().sum())
A 2
B 0
dtype: int64
#14.
df.to_csv() — Writes the DataFrame to a comma-separated values (csv) file; when no path is given, returns the CSV text as a string.
import pandas as pd
df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
csv_output = df.to_csv(index=False)
print(csv_output)
A,B
1,3
2,4
#15.
df.copy() — Creates a deep copy of a DataFrame.
import pandas as pd
df1 = pd.DataFrame({'A': [1]})
df2 = df1.copy()
df2.loc[0, 'A'] = 99
print(f"Original df1:\n{df1}")
print(f"Copied df2:\n{df2}")
Original df1:
A
0 1
Copied df2:
A
0 99
---
#DataAnalysis #Pandas #Selection #Indexing
Part 2: Pandas - Data Selection & Indexing
#16.
df['col'] — Selects a single column as a Series.
import pandas as pd
df = pd.DataFrame({'Name': ['Alice', 'Bob'], 'Age': [30, 25]})
print(df['Name'])
0 Alice
1 Bob
Name: Name, dtype: object
#17.
df[['col1', 'col2']] — Selects multiple columns as a new DataFrame.
import pandas as pd
df = pd.DataFrame({'Name': ['Alice'], 'Age': [30], 'City': ['New York']})
print(df[['Name', 'City']])
Name City
0 Alice New York
#18.
df.loc[] — Accesses a group of rows and columns by label(s) or a boolean array.
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]}, index=['x', 'y', 'z'])
print(df.loc['y'])
A 2
Name: y, dtype: int64
#19.
df.iloc[] — Accesses a group of rows and columns by integer position(s).
import pandas as pd
df = pd.DataFrame({'A': [10, 20, 30]})
print(df.iloc[1])
A 20
Name: 1, dtype: int64
#20.
df[df['col'] > value] — Selects rows based on a boolean condition (boolean indexing).
import pandas as pd
df = pd.DataFrame({'Age': [22, 35, 18, 40]})
print(df[df['Age'] > 30])
Age
1 35
3 40
#21.
df.set_index() — Sets the DataFrame index using existing columns.
import pandas as pd
df = pd.DataFrame({'Country': ['USA', 'UK'], 'Code': [1, 44]})
df_indexed = df.set_index('Country')
print(df_indexed)
Code
Country
USA 1
UK 44
#22.
df.reset_index() — Resets the index of the DataFrame and uses the default integer index.
import pandas as pd
df = pd.DataFrame({'Code': [1, 44]}, index=['USA', 'UK'])
df_reset = df.reset_index()
print(df_reset)
index Code
0 USA 1
1 UK 44
#23.
df.at[] — Accesses a single value by row/column label pair. Faster than .loc.
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]}, index=['x', 'y', 'z'])
print(df.at['y', 'A'])
2
#24.
df.iat[] — Accesses a single value by row/column integer position. Faster than .iloc.
import pandas as pd
df = pd.DataFrame({'A': [10, 20, 30]})
print(df.iat[1, 0])
20
#25.
df.sample() — Returns a random sample of items from an axis of the object.
import pandas as pd
df = pd.DataFrame({'A': range(10)})
print(df.sample(n=3))
A
8 8
2 2
5 5
(Note: Output rows will be random)
---
#DataAnalysis #Pandas #DataCleaning #Manipulation
Part 3: Pandas - Data Cleaning & Manipulation
#26.
df.dropna() — Removes missing values.
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan, 3]})
print(df.dropna())
A
0 1.0
2 3.0
#27.
df.fillna() — Fills missing (NA/NaN) values using a specified method.
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan, 3]})
print(df.fillna(0))
A
0 1.0
1 0.0
2 3.0
#28.
df.astype() — Casts a pandas object to a specified dtype.
import pandas as pd
df = pd.DataFrame({'A': [1.1, 2.7, 3.5]})
df['A'] = df['A'].astype(int)
print(df)
A
0 1
1 2
2 3
#29.
df.rename() — Alters axes labels.
import pandas as pd
df = pd.DataFrame({'a': [1], 'b': [2]})
df_renamed = df.rename(columns={'a': 'A', 'b': 'B'})
print(df_renamed)
A B
0 1 2
#30.
df.drop() — Drops specified labels from rows or columns.
import pandas as pd
df = pd.DataFrame({'A': [1], 'B': [2], 'C': [3]})
df_dropped = df.drop(columns=['B'])
print(df_dropped)
A C
0 1 3
#31.
pd.to_datetime() — Converts argument to datetime.
import pandas as pd
s = pd.Series(['2023-01-01', '2023-01-02'])
dt_s = pd.to_datetime(s)
print(dt_s)
0 2023-01-01
1 2023-01-02
dtype: datetime64[ns]
#32.
df.apply() — Applies a function along an axis of the DataFrame.
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]})
df['B'] = df['A'].apply(lambda x: x * 2)
print(df)
A B
0 1 2
1 2 4
2 3 6
#33.
df['col'].map() — Maps values of a Series according to an input mapping or function.
import pandas as pd
df = pd.DataFrame({'Gender': ['M', 'F', 'M']})
df['Gender_Full'] = df['Gender'].map({'M': 'Male', 'F': 'Female'})
print(df)
Gender Gender_Full
0 M Male
1 F Female
2 M Male
#34.
df.replace() — Replaces values given in to_replace with value.
import pandas as pd
df = pd.DataFrame({'Score': [10, -99, 15, -99]})
df_replaced = df.replace(-99, 0)
print(df_replaced)
Score
0 10
1 0
2 15
3 0
#35.
df.duplicated() — Returns a boolean Series denoting duplicate rows.
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 1], 'B': ['a', 'b', 'a']})
print(df.duplicated())
0 False
1 False
2 True
dtype: bool
#36.
df.drop_duplicates() — Returns a DataFrame with duplicate rows removed.
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 1], 'B': ['a', 'b', 'a']})
print(df.drop_duplicates())
A B
0 1 a
1 2 b
#37.
df.sort_values() — Sorts by the values along either axis.
import pandas as pd
df = pd.DataFrame({'Age': [25, 22, 30]})
print(df.sort_values(by='Age'))
Age
1 22
0 25
2 30
#38.
df.sort_index() — Sorts object by labels (along an axis).
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]}, index=[10, 5, 8])
print(df.sort_index())
A
5 2
8 3
10 1
#39.
pd.cut() — Bins values into discrete intervals.
import pandas as pd
ages = pd.Series([22, 35, 58, 8, 42])
age_bins = pd.cut(ages, bins=[0, 18, 35, 60], labels=['Child', 'Adult', 'Senior'])
print(age_bins)
0 Adult
1 Adult
2 Senior
3 Child
4 Senior
dtype: category
Categories (3, object): ['Child' < 'Adult' < 'Senior']
#40.
pd.qcut() — Quantile-based discretization function (bins into equal-sized groups).
import pandas as pd
data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
quartiles = pd.qcut(data, 4, labels=False)
print(quartiles)
0 0
1 0
2 0
3 1
4 1
5 2
6 2
7 3
8 3
9 3
dtype: int64
#41.
s.str.contains() — Tests if a pattern or regex is contained within a string of a Series.
import pandas as pd
s = pd.Series(['apple', 'banana', 'apricot'])
print(s[s.str.contains('ap')])
0 apple
2 apricot
dtype: object
#42.
s.str.split() — Splits strings around a given separator/delimiter.
import pandas as pd
s = pd.Series(['a_b', 'c_d'])
print(s.str.split('_', expand=True))
0 1
0 a b
1 c d
#43.
s.str.lower() — Converts strings in the Series to lowercase.
import pandas as pd
s = pd.Series(['HELLO', 'World'])
print(s.str.lower())
0 hello
1 world
dtype: object
#44.
s.str.strip() — Removes leading and trailing whitespace.
import pandas as pd
s = pd.Series([' hello ', ' world '])
print(s.str.strip())
0 hello
1 world
dtype: object
#45.
s.dt.year — Extracts the year from a datetime Series.
import pandas as pd
s = pd.to_datetime(pd.Series(['2023-01-01', '2024-05-10']))
print(s.dt.year)
0 2023
1 2024
dtype: int64
(Note: pandas 2.0 and later print dtype: int32)
---
#DataAnalysis #Pandas #Grouping #Aggregation
Part 4: Pandas - Grouping & Aggregation
#46.
df.groupby() — Groups a DataFrame using a mapper or by a Series of columns.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
grouped = df.groupby('Team')
print(grouped)
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x...>
#47.
groupby.agg() — Aggregates using one or more operations over the specified axis.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
agg_df = df.groupby('Team').agg(['mean', 'sum'])
print(agg_df)
Points
mean sum
Team
A 11.0 22
B 7.0 14
#48.
groupby.size() — Computes group sizes.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B', 'A']})
print(df.groupby('Team').size())
Team
A 3
B 2
dtype: int64
#49.
groupby.count() — Computes the count of non-NA cells for each group.
import pandas as pd
import numpy as np
df = pd.DataFrame({'Team': ['A', 'B', 'A'], 'Score': [1, np.nan, 3]})
print(df.groupby('Team').count())
Score
Team
A 2
B 0
#50.
groupby.mean() — Computes the mean of group values.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').mean())
Points
Team
A 11.0
B 7.0
#51.
groupby.sum() — Computes the sum of group values.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').sum())
Points
Team
A 22
B 14
#52.
groupby.min() — Computes the minimum of group values.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').min())
Points
Team
A 10
B 6
#53.
groupby.max() — Computes the maximum of group values.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').max())
Points
Team
A 12
B 8
#54.
df.pivot_table() — Creates a spreadsheet-style pivot table as a DataFrame.
import pandas as pd
df = pd.DataFrame({'A': ['foo', 'foo', 'bar'], 'B': ['one', 'two', 'one'], 'C': [1, 2, 3]})
pivot = df.pivot_table(values='C', index='A', columns='B')
print(pivot)
B one two
A
bar 3.0 NaN
foo 1.0 2.0
#55.
pd.crosstab() — Computes a cross-tabulation of two (or more) factors.
import pandas as pd
df = pd.DataFrame({'A': ['foo', 'foo', 'bar'], 'B': ['one', 'two', 'one']})
crosstab = pd.crosstab(df.A, df.B)
print(crosstab)
B one two
A
bar 1 0
foo 1 1
---
#DataAnalysis #Pandas #Merging #Joining
Part 5: Pandas - Merging & Concatenating
#56.
pd.merge() — Merges DataFrame or named Series objects with a database-style join.
import pandas as pd
df1 = pd.DataFrame({'key': ['A', 'B'], 'val1': [1, 2]})
df2 = pd.DataFrame({'key': ['A', 'B'], 'val2': [3, 4]})
merged = pd.merge(df1, df2, on='key')
print(merged)
key val1 val2
0 A 1 3
1 B 2 4
#57.
pd.concat() — Concatenates pandas objects along a particular axis.
import pandas as pd
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'A': [3, 4]})
concatenated = pd.concat([df1, df2])
print(concatenated)
A
0 1
1 2
0 3
1 4
#58.
df.join() — Joins columns with other DataFrame(s) on index or on a key column.
❤2🎉1
Forwarded from Data Analytics
pandas Cheat Sheet.pdf
1.6 MB
👨🏻💻 To easily read, inspect, clean, and manipulate data however you want, you need to master pandas!
https://t.iss.one/DataAnalyticsX
Please open Telegram to view this post
VIEW IN TELEGRAM
❤10👍5🔥2🆒1
🚀 #Pandas Cheat Sheet for Everyday Data Work
This covers the essential functions we use in day to day work like inspecting data, selecting rows and columns, cleaning, manipulating and doing quick aggregations.
https://t.iss.one/CodeProgrammer ❤️
This covers the essential functions we use in day to day work like inspecting data, selecting rows and columns, cleaning, manipulating and doing quick aggregations.
https://t.iss.one/CodeProgrammer
Please open Telegram to view this post
VIEW IN TELEGRAM
❤12👍7🔥1
Mastering pandas%22.pdf
1.6 MB
👨🏻💻 I can't count how much time and energy I've wasted working with messy, error-prone data: incomplete tables, duplicate records, and unorganized data. Exactly the kind of things that make analysis difficult and frustrating.
https://t.iss.one/CodeProgrammer
Please open Telegram to view this post
VIEW IN TELEGRAM
❤8👍1