Machine Learning with Python

import pandas as pd
df = pd.DataFrame({'Fruit': ['Apple', 'Banana', 'Apple', 'Orange']})
print(df['Fruit'].nunique())

#12. df.isnull()
Returns a DataFrame of boolean values indicating missing values.

import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan], 'B': [np.nan, 'x']})
print(df.isnull())

A      B
0  False   True
1   True  False

#13. df.isnull().sum()
Returns the number of missing values in each column.

import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan, 3, np.nan], 'B': [5, 6, 7, 8]})
print(df.isnull().sum())

A    2
B    0
dtype: int64

#14. df.to_csv()
Writes the DataFrame to a comma-separated values (csv) file.

import pandas as pd
df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
csv_output = df.to_csv(index=False)
print(csv_output)

A,B
1,3
2,4

#15. df.copy()
Creates a deep copy of a DataFrame.

import pandas as pd
df1 = pd.DataFrame({'A': [1]})
df2 = df1.copy()
df2.loc[0, 'A'] = 99
print(f"Original df1:\n{df1}")
print(f"Copied df2:\n{df2}")

Original df1:
   A
0  1
Copied df2:
    A
0  99

---
#DataAnalysis #Pandas #Selection #Indexing

Part 2: Pandas - Data Selection & Indexing

#16. df['col']
Selects a single column as a Series.

import pandas as pd
df = pd.DataFrame({'Name': ['Alice', 'Bob'], 'Age': [30, 25]})
print(df['Name'])

0    Alice
1      Bob
Name: Name, dtype: object

#17. df[['col1', 'col2']]
Selects multiple columns as a new DataFrame.

import pandas as pd
df = pd.DataFrame({'Name': ['Alice'], 'Age': [30], 'City': ['New York']})
print(df[['Name', 'City']])

Name       City
0  Alice   New York

#18. df.loc[]
Accesses a group of rows and columns by label(s) or a boolean array.

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]}, index=['x', 'y', 'z'])
print(df.loc['y'])

A    2
Name: y, dtype: int64

#19. df.iloc[]
Accesses a group of rows and columns by integer position(s).

import pandas as pd
df = pd.DataFrame({'A': [10, 20, 30]})
print(df.iloc[1])

A    20
Name: 1, dtype: int64

#20. df[df['col'] > value]
Selects rows based on a boolean condition (boolean indexing).

import pandas as pd
df = pd.DataFrame({'Age': [22, 35, 18, 40]})
print(df[df['Age'] > 30])

Age
1   35
3   40

#21. df.set_index()
Sets the DataFrame index using existing columns.

import pandas as pd
df = pd.DataFrame({'Country': ['USA', 'UK'], 'Code': [1, 44]})
df_indexed = df.set_index('Country')
print(df_indexed)

Code
Country      
USA         1
UK         44

#22. df.reset_index()
Resets the index of the DataFrame and uses the default integer index.

import pandas as pd
df = pd.DataFrame({'Code': [1, 44]}, index=['USA', 'UK'])
df_reset = df.reset_index()
print(df_reset)

index  Code
0   USA     1
1    UK    44

#23. df.at[]
Accesses a single value by row/column label pair. Faster than .loc.

❤3

1.77K views17:02

Machine Learning with Python

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]}, index=['x', 'y', 'z'])
print(df.at['y', 'A'])

#24. df.iat[]
Accesses a single value by row/column integer position. Faster than .iloc.

import pandas as pd
df = pd.DataFrame({'A': [10, 20, 30]})
print(df.iat[1, 0])

#25. df.sample()
Returns a random sample of items from an axis of object.

import pandas as pd
df = pd.DataFrame({'A': range(10)})
print(df.sample(n=3))

A
8  8
2  2
5  5
(Note: Output rows will be random)

---
#DataAnalysis #Pandas #DataCleaning #Manipulation

Part 3: Pandas - Data Cleaning & Manipulation

#26. df.dropna()
Removes missing values.

import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan, 3]})
print(df.dropna())

A
0  1.0
2  3.0

#27. df.fillna()
Fills missing (NA/NaN) values using a specified method.

import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan, 3]})
print(df.fillna(0))

#28. df.astype()
Casts a pandas object to a specified dtype.

import pandas as pd
df = pd.DataFrame({'A': [1.1, 2.7, 3.5]})
df['A'] = df['A'].astype(int)
print(df)

#29. df.rename()
Alters axes labels.

import pandas as pd
df = pd.DataFrame({'a': [1], 'b': [2]})
df_renamed = df.rename(columns={'a': 'A', 'b': 'B'})
print(df_renamed)

A  B
0  1  2

#30. df.drop()
Drops specified labels from rows or columns.

import pandas as pd
df = pd.DataFrame({'A': [1], 'B': [2], 'C': [3]})
df_dropped = df.drop(columns=['B'])
print(df_dropped)

A  C
0  1  3

#31. pd.to_datetime()
Converts argument to datetime.

import pandas as pd
s = pd.Series(['2023-01-01', '2023-01-02'])
dt_s = pd.to_datetime(s)
print(dt_s)

0   2023-01-01
1   2023-01-02
dtype: datetime64[ns]

#32. df.apply()
Applies a function along an axis of the DataFrame.

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]})
df['B'] = df['A'].apply(lambda x: x * 2)
print(df)

#33. df['col'].map()
Maps values of a Series according to an input mapping or function.

import pandas as pd
df = pd.DataFrame({'Gender': ['M', 'F', 'M']})
df['Gender_Full'] = df['Gender'].map({'M': 'Male', 'F': 'Female'})
print(df)

Gender Gender_Full
0      M        Male
1      F      Female
2      M        Male

#34. df.replace()
Replaces values given in to_replace with value.

import pandas as pd
df = pd.DataFrame({'Score': [10, -99, 15, -99]})
df_replaced = df.replace(-99, 0)
print(df_replaced)

#35. df.duplicated()
Returns a boolean Series denoting duplicate rows.

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 1], 'B': ['a', 'b', 'a']})
print(df.duplicated())

0    False
1    False
2     True
dtype: bool

#36. df.drop_duplicates()
Returns a DataFrame with duplicate rows removed.

1.74K views17:02

Machine Learning with Python

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 1], 'B': ['a', 'b', 'a']})
print(df.drop_duplicates())

A  B
0  1  a
1  2  b

#37. df.sort_values()
Sorts by the values along either axis.

import pandas as pd
df = pd.DataFrame({'Age': [25, 22, 30]})
print(df.sort_values(by='Age'))

#38. df.sort_index()
Sorts object by labels (along an axis).

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]}, index=[10, 5, 8])
print(df.sort_index())

#39. pd.cut()
Bins values into discrete intervals.

import pandas as pd
ages = pd.Series([22, 35, 58, 8, 42])
age_bins = pd.cut(ages, bins=[0, 18, 35, 60], labels=['Child', 'Adult', 'Senior'])
print(age_bins)

0     Adult
1     Adult
2    Senior
3     Child
4    Senior
dtype: category
Categories (3, object): ['Child' < 'Adult' < 'Senior']

#40. pd.qcut()
Quantile-based discretization function (bins into equal-sized groups).

import pandas as pd
data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
quartiles = pd.qcut(data, 4, labels=False)
print(quartiles)

0    0
1    0
2    0
3    1
4    1
5    2
6    2
7    3
8    3
9    3
dtype: int64

#41. s.str.contains()
Tests if a pattern or regex is contained within a string of a Series.

import pandas as pd
s = pd.Series(['apple', 'banana', 'apricot'])
print(s[s.str.contains('ap')])

0      apple
2    apricot
dtype: object

#42. s.str.split()
Splits strings around a given separator/delimiter.

import pandas as pd
s = pd.Series(['a_b', 'c_d'])
print(s.str.split('_', expand=True))

0  1
0  a  b
1  c  d

#43. s.str.lower()
Converts strings in the Series to lowercase.

import pandas as pd
s = pd.Series(['HELLO', 'World'])
print(s.str.lower())

0    hello
1    world
dtype: object

#44. s.str.strip()
Removes leading and trailing whitespace.

import pandas as pd
s = pd.Series(['  hello  ', ' world '])
print(s.str.strip())

0    hello
1    world
dtype: object

#45. s.dt.year
Extracts the year from a datetime Series.

import pandas as pd
s = pd.to_datetime(pd.Series(['2023-01-01', '2024-05-10']))
print(s.dt.year)

0    2023
1    2024
dtype: int64

---
#DataAnalysis #Pandas #Grouping #Aggregation

Part 4: Pandas - Grouping & Aggregation

#46. df.groupby()
Groups a DataFrame using a mapper or by a Series of columns.

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
grouped = df.groupby('Team')
print(grouped)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x...>

#47. groupby.agg()
Aggregates using one or more operations over the specified axis.

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
agg_df = df.groupby('Team').agg(['mean', 'sum'])
print(agg_df)

Points     
        mean  sum
Team             
A         11   22
B          7   14

#48. groupby.size()
Computes group sizes.

❤1

2K views17:02

Machine Learning with Python

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B', 'A']})
print(df.groupby('Team').size())

Team
A    3
B    2
dtype: int64

#49. groupby.count()
Computes the count of non-NA cells for each group.

import pandas as pd
import numpy as np
df = pd.DataFrame({'Team': ['A', 'B', 'A'], 'Score': [1, np.nan, 3]})
print(df.groupby('Team').count())

Score
Team       
A         2
B         0

#50. groupby.mean()
Computes the mean of group values.

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').mean())

Points
Team        
A         11
B          7

#51. groupby.sum()
Computes the sum of group values.

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').sum())

Points
Team        
A         22
B         14

#52. groupby.min()
Computes the minimum of group values.

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').min())

Points
Team        
A         10
B          6

#53. groupby.max()
Computes the maximum of group values.

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').max())

Points
Team        
A         12
B          8

#54. df.pivot_table()
Creates a spreadsheet-style pivot table as a DataFrame.

import pandas as pd
df = pd.DataFrame({'A': ['foo', 'foo', 'bar'], 'B': ['one', 'two', 'one'], 'C': [1, 2, 3]})
pivot = df.pivot_table(values='C', index='A', columns='B')
print(pivot)

B    one  two
A            
bar  3.0  NaN
foo  1.0  2.0

#55. pd.crosstab()
Computes a cross-tabulation of two (or more) factors.

import pandas as pd
df = pd.DataFrame({'A': ['foo', 'foo', 'bar'], 'B': ['one', 'two', 'one']})
crosstab = pd.crosstab(df.A, df.B)
print(crosstab)

B    one  two
A            
bar    1    0
foo    1    1

---
#DataAnalysis #Pandas #Merging #Joining

Part 5: Pandas - Merging & Concatenating

#56. pd.merge()
Merges DataFrame or named Series objects with a database-style join.

import pandas as pd
df1 = pd.DataFrame({'key': ['A', 'B'], 'val1': [1, 2]})
df2 = pd.DataFrame({'key': ['A', 'B'], 'val2': [3, 4]})
merged = pd.merge(df1, df2, on='key')
print(merged)

key  val1  val2
0   A     1     3
1   B     2     4

#57. pd.concat()
Concatenates pandas objects along a particular axis.

import pandas as pd
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'A': [3, 4]})
concatenated = pd.concat([df1, df2])
print(concatenated)

#58. df.join()
Joins columns with other DataFrame(s) on index or on a key column.

❤2🎉1

2.13K views17:02

Machine Learning with Python

Forwarded from Data Analytics

pandas Cheat Sheet.pdf

1.6 MB

📕

#pandas Cheat Sheet

👨🏻‍💻 To easily read, inspect, clean, and manipulate data however you want, you need to master pandas!

✏️ To make learning and using pandas easier, this #cheatsheet covers almost all the important features you need for data-driven projects.

✔️ Reading and writing data
✔️ Data inspection
✔️ Data transformation and cleaning
✔️ Grouping and summarizing
✔️ Combining datasets

🌐

#DataScience #DataScience

https://t.iss.one/DataAnalyticsX

🏐

Please open Telegram to view this post

VIEW IN TELEGRAM

❤10👍5🔥2🆒1

3.84K views21:46

Machine Learning with Python

🚀 #Pandas Cheat Sheet for Everyday Data Work

This covers the essential functions we use in day to day work like inspecting data, selecting rows and columns, cleaning, manipulating and doing quick aggregations.

https://t.iss.one/CodeProgrammer

❤️

Please open Telegram to view this post

VIEW IN TELEGRAM

❤12👍7🔥1

5.38K viewsedited 09:20

Machine Learning with Python

Mastering pandas%22.pdf

1.6 MB

🌟

A new and comprehensive book "Mastering pandas"

👨🏻‍💻 If I've worked with messy and error-prone data this time, I don't know how much time and energy I've wasted. Incomplete tables, repetitive records, and unorganized data. Exactly the kind of things that make analysis difficult and frustrate you.

⬅️

And the only way to save yourself is to use pandas! A tool that makes processes 10 times faster.

🏷 This book is a comprehensive and organized guide to pandas, so you can start from scratch and gradually master this library and gain the ability to implement real projects. In this file, you'll learn:

🔹 How to clean and prepare large amounts of data for analysis,

🔹 How to analyze real business data and draw conclusions,

🔹 How to automate repetitive tasks with a few lines of code,

🔹 And improve the speed and accuracy of your analyses significantly.

🌐 #DataScience #DataScience #Pandas #Python

https://t.iss.one/CodeProgrammer

⚡️

Please open Telegram to view this post

VIEW IN TELEGRAM

❤8👍1

4.46K views11:04

About

Blog

Apps

Platform