Python | Machine Learning | Coding | R
67.3K subscribers
1.25K photos
89 videos
153 files
906 links
Help and ads: @hussein_sheikho

Discover powerful insights with Python, Machine Learning, Coding, and R—your essential toolkit for data-driven solutions, smart alg

List of our channels:
https://t.iss.one/addlist/8_rRW2scgfRhOTc0

https://telega.io/?r=nikapsOH
Download Telegram
In Python, handling CSV files is straightforward: use the built-in csv module for reading and writing tabular data, or pandas for more advanced analysis. Importing and exporting datasets is an essential data-processing skill and a common interview topic.

# Reading CSV with csv module (basic)
import csv
with open('data.csv', 'r') as file:
    reader = csv.reader(file)
    data = list(reader) # data = [['Name', 'Age'], ['Alice', '30'], ['Bob', '25']]

# Writing CSV with csv module
import csv
with open('output.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Name', 'Age']) # Header
    writer.writerows([['Alice', 30], ['Bob', 25]]) # Data rows

# Advanced: Reading with pandas (handles headers, missing values)
import pandas as pd
df = pd.read_csv('data.csv') # df = DataFrame with columns 'Name', 'Age'
print(df.head()) # Output: First 5 rows preview

# Writing with pandas
df.to_csv('output.csv', index=False) # Saves without row indices


#python #csv #pandas #datahandling #fileio #interviewtips

👉 @DataScience4
4👍4
Top 100 Data Analysis Commands & Functions

#DataAnalysis #Pandas #DataLoading #Inspection

Part 1: Pandas - Data Loading & Inspection

#1. pd.read_csv()
Reads a comma-separated values (csv) file into a Pandas DataFrame.

import pandas as pd
from io import StringIO

csv_data = "col1,col2,col3\n1,a,True\n2,b,False"
df = pd.read_csv(StringIO(csv_data))
print(df)

col1 col2   col3
0 1 a True
1 2 b False


#2. df.head()
Returns the first n rows of the DataFrame (default is 5).

import pandas as pd
df = pd.DataFrame({'A': range(10), 'B': list('abcdefghij')})
print(df.head(3))

A  B
0 0 a
1 1 b
2 2 c


#3. df.tail()
Returns the last n rows of the DataFrame (default is 5).

import pandas as pd
df = pd.DataFrame({'A': range(10), 'B': list('abcdefghij')})
print(df.tail(3))

A  B
7 7 h
8 8 i
9 9 j


#4. df.info()
Prints a concise summary of a DataFrame, including data types and non-null values.

import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, 2, np.nan], 'B': ['x', 'y', 'z']})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 2 non-null float64
1 B 3 non-null object
dtypes: float64(1), object(1)
memory usage: 176.0+ bytes


#5. df.describe()
Generates descriptive statistics for numerical columns.

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3, 4, 5]})
print(df.describe())

A
count 5.000000
mean 3.000000
std 1.581139
min 1.000000
25% 2.000000
50% 3.000000
75% 4.000000
max 5.000000


#6. df.shape
Returns a tuple representing the dimensionality (rows, columns) of the DataFrame.

import pandas as pd
df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]})
print(df.shape)

(2, 3)


#7. df.columns
Returns the column labels of the DataFrame.

import pandas as pd
df = pd.DataFrame({'Name': ['Alice'], 'Age': [30]})
print(df.columns)

Index(['Name', 'Age'], dtype='object')


#8. df.dtypes
Returns the data types of each column.

import pandas as pd
df = pd.DataFrame({'A': [1, 2], 'B': [1.1, 2.2], 'C': ['x', 'y']})
print(df.dtypes)

A      int64
B float64
C object
dtype: object


#9. df['col'].value_counts()
Returns a Series containing counts of unique values in a column.

import pandas as pd
df = pd.DataFrame({'Fruit': ['Apple', 'Banana', 'Apple', 'Orange', 'Banana', 'Apple']})
print(df['Fruit'].value_counts())

Apple     3
Banana 2
Orange 1
Name: Fruit, dtype: int64


#10. df['col'].unique()
Returns an array of the unique values in a column.

import pandas as pd
df = pd.DataFrame({'Fruit': ['Apple', 'Banana', 'Apple', 'Orange']})
print(df['Fruit'].unique())

['Apple' 'Banana' 'Orange']


#11. df['col'].nunique()
Returns the number of unique values in a column.

import pandas as pd
df = pd.DataFrame({'Fruit': ['Apple', 'Banana', 'Apple', 'Orange']})
print(df['Fruit'].nunique())

3


#12. df.isnull()
Returns a DataFrame of boolean values indicating missing values.

import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan], 'B': [np.nan, 'x']})
print(df.isnull())

A      B
0 False True
1 True False


#13. df.isnull().sum()
Returns the number of missing values in each column.

import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan, 3, np.nan], 'B': [5, 6, 7, 8]})
print(df.isnull().sum())

A    2
B 0
dtype: int64


#14. df.to_csv()
Writes the DataFrame to a comma-separated values (csv) file.

import pandas as pd
df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
csv_output = df.to_csv(index=False)
print(csv_output)

A,B
1,3
2,4


#15. df.copy()
Creates a deep copy of a DataFrame.

import pandas as pd
df1 = pd.DataFrame({'A': [1]})
df2 = df1.copy()
df2.loc[0, 'A'] = 99
print(f"Original df1:\n{df1}")
print(f"Copied df2:\n{df2}")

Original df1:
A
0 1
Copied df2:
A
0 99

---
#DataAnalysis #Pandas #Selection #Indexing

Part 2: Pandas - Data Selection & Indexing

#16. df['col']
Selects a single column as a Series.

import pandas as pd
df = pd.DataFrame({'Name': ['Alice', 'Bob'], 'Age': [30, 25]})
print(df['Name'])

0    Alice
1 Bob
Name: Name, dtype: object


#17. df[['col1', 'col2']]
Selects multiple columns as a new DataFrame.

import pandas as pd
df = pd.DataFrame({'Name': ['Alice'], 'Age': [30], 'City': ['New York']})
print(df[['Name', 'City']])

Name       City
0 Alice New York


#18. df.loc[]
Accesses a group of rows and columns by label(s) or a boolean array.

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]}, index=['x', 'y', 'z'])
print(df.loc['y'])

A    2
Name: y, dtype: int64


#19. df.iloc[]
Accesses a group of rows and columns by integer position(s).

import pandas as pd
df = pd.DataFrame({'A': [10, 20, 30]})
print(df.iloc[1])

A    20
Name: 1, dtype: int64


#20. df[df['col'] > value]
Selects rows based on a boolean condition (boolean indexing).

import pandas as pd
df = pd.DataFrame({'Age': [22, 35, 18, 40]})
print(df[df['Age'] > 30])

Age
1 35
3 40


#21. df.set_index()
Sets the DataFrame index using existing columns.

import pandas as pd
df = pd.DataFrame({'Country': ['USA', 'UK'], 'Code': [1, 44]})
df_indexed = df.set_index('Country')
print(df_indexed)

Code
Country
USA 1
UK 44


#22. df.reset_index()
Resets the index of the DataFrame and uses the default integer index.

import pandas as pd
df = pd.DataFrame({'Code': [1, 44]}, index=['USA', 'UK'])
df_reset = df.reset_index()
print(df_reset)

index  Code
0 USA 1
1 UK 44


#23. df.at[]
Accesses a single value by row/column label pair. Faster than .loc.

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]}, index=['x', 'y', 'z'])
print(df.at['y', 'A'])

2


#24. df.iat[]
Accesses a single value by row/column integer position. Faster than .iloc.

import pandas as pd
df = pd.DataFrame({'A': [10, 20, 30]})
print(df.iat[1, 0])

20


#25. df.sample()
Returns a random sample of items from an axis of object.

import pandas as pd
df = pd.DataFrame({'A': range(10)})
print(df.sample(n=3))

A
8 8
2 2
5 5
(Note: Output rows will be random)

---
#DataAnalysis #Pandas #DataCleaning #Manipulation

Part 3: Pandas - Data Cleaning & Manipulation

#26. df.dropna()
Removes missing values.

import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan, 3]})
print(df.dropna())

A
0 1.0
2 3.0


#27. df.fillna()
Fills missing (NA/NaN) values using a specified method.

import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan, 3]})
print(df.fillna(0))

A
0 1.0
1 0.0
2 3.0


#28. df.astype()
Casts a pandas object to a specified dtype.

import pandas as pd
df = pd.DataFrame({'A': [1.1, 2.7, 3.5]})
df['A'] = df['A'].astype(int)
print(df)

A
0 1
1 2
2 3


#29. df.rename()
Alters axes labels.

import pandas as pd
df = pd.DataFrame({'a': [1], 'b': [2]})
df_renamed = df.rename(columns={'a': 'A', 'b': 'B'})
print(df_renamed)

A  B
0 1 2


#30. df.drop()
Drops specified labels from rows or columns.

import pandas as pd
df = pd.DataFrame({'A': [1], 'B': [2], 'C': [3]})
df_dropped = df.drop(columns=['B'])
print(df_dropped)

A  C
0 1 3


#31. pd.to_datetime()
Converts argument to datetime.

import pandas as pd
s = pd.Series(['2023-01-01', '2023-01-02'])
dt_s = pd.to_datetime(s)
print(dt_s)

0   2023-01-01
1 2023-01-02
dtype: datetime64[ns]


#32. df.apply()
Applies a function along an axis of the DataFrame.

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]})
df['B'] = df['A'].apply(lambda x: x * 2)
print(df)

A  B
0 1 2
1 2 4
2 3 6


#33. df['col'].map()
Maps values of a Series according to an input mapping or function.

import pandas as pd
df = pd.DataFrame({'Gender': ['M', 'F', 'M']})
df['Gender_Full'] = df['Gender'].map({'M': 'Male', 'F': 'Female'})
print(df)

Gender Gender_Full
0 M Male
1 F Female
2 M Male


#34. df.replace()
Replaces values given in to_replace with value.

import pandas as pd
df = pd.DataFrame({'Score': [10, -99, 15, -99]})
df_replaced = df.replace(-99, 0)
print(df_replaced)

Score
0 10
1 0
2 15
3 0


#35. df.duplicated()
Returns a boolean Series denoting duplicate rows.

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 1], 'B': ['a', 'b', 'a']})
print(df.duplicated())

0    False
1 False
2 True
dtype: bool


#36. df.drop_duplicates()
Returns a DataFrame with duplicate rows removed.

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 1], 'B': ['a', 'b', 'a']})
print(df.drop_duplicates())

A  B
0 1 a
1 2 b


#37. df.sort_values()
Sorts by the values along either axis.

import pandas as pd
df = pd.DataFrame({'Age': [25, 22, 30]})
print(df.sort_values(by='Age'))

Age
1 22
0 25
2 30


#38. df.sort_index()
Sorts object by labels (along an axis).

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]}, index=[10, 5, 8])
print(df.sort_index())

A
5 2
8 3
10 1


#39. pd.cut()
Bins values into discrete intervals.

import pandas as pd
ages = pd.Series([22, 35, 58, 8, 42])
age_bins = pd.cut(ages, bins=[0, 18, 35, 60], labels=['Child', 'Adult', 'Senior'])
print(age_bins)

0     Adult
1 Adult
2 Senior
3 Child
4 Senior
dtype: category
Categories (3, object): ['Child' < 'Adult' < 'Senior']


#40. pd.qcut()
Quantile-based discretization function (bins into equal-sized groups).

import pandas as pd
data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
quartiles = pd.qcut(data, 4, labels=False)
print(quartiles)

0    0
1 0
2 0
3 1
4 1
5 2
6 2
7 3
8 3
9 3
dtype: int64


#41. s.str.contains()
Tests if a pattern or regex is contained within a string of a Series.

import pandas as pd
s = pd.Series(['apple', 'banana', 'apricot'])
print(s[s.str.contains('ap')])

0      apple
2 apricot
dtype: object


#42. s.str.split()
Splits strings around a given separator/delimiter.

import pandas as pd
s = pd.Series(['a_b', 'c_d'])
print(s.str.split('_', expand=True))

0  1
0 a b
1 c d


#43. s.str.lower()
Converts strings in the Series to lowercase.

import pandas as pd
s = pd.Series(['HELLO', 'World'])
print(s.str.lower())

0    hello
1 world
dtype: object


#44. s.str.strip()
Removes leading and trailing whitespace.

import pandas as pd
s = pd.Series([' hello ', ' world '])
print(s.str.strip())

0    hello
1 world
dtype: object


#45. s.dt.year
Extracts the year from a datetime Series.

import pandas as pd
s = pd.to_datetime(pd.Series(['2023-01-01', '2024-05-10']))
print(s.dt.year)

0    2023
1 2024
dtype: int64

---
#DataAnalysis #Pandas #Grouping #Aggregation

Part 4: Pandas - Grouping & Aggregation

#46. df.groupby()
Groups a DataFrame using a mapper or by a Series of columns.

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
grouped = df.groupby('Team')
print(grouped)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x...>


#47. groupby.agg()
Aggregates using one or more operations over the specified axis.

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
agg_df = df.groupby('Team').agg(['mean', 'sum'])
print(agg_df)

     Points
       mean sum
Team
A      11.0  22
B       7.0  14


#48. groupby.size()
Computes group sizes.

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B', 'A']})
print(df.groupby('Team').size())

Team
A 3
B 2
dtype: int64


#49. groupby.count()
Computes the count of non-NA cells for each group.

import pandas as pd
import numpy as np
df = pd.DataFrame({'Team': ['A', 'B', 'A'], 'Score': [1, np.nan, 3]})
print(df.groupby('Team').count())

Score
Team
A 2
B 0


#50. groupby.mean()
Computes the mean of group values.

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').mean())

      Points
Team
A       11.0
B        7.0


#51. groupby.sum()
Computes the sum of group values.

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').sum())

Points
Team
A 22
B 14


#52. groupby.min()
Computes the minimum of group values.

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').min())

Points
Team
A 10
B 6


#53. groupby.max()
Computes the maximum of group values.

import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').max())

Points
Team
A 12
B 8


#54. df.pivot_table()
Creates a spreadsheet-style pivot table as a DataFrame.

import pandas as pd
df = pd.DataFrame({'A': ['foo', 'foo', 'bar'], 'B': ['one', 'two', 'one'], 'C': [1, 2, 3]})
pivot = df.pivot_table(values='C', index='A', columns='B')
print(pivot)

B    one  two
A
bar 3.0 NaN
foo 1.0 2.0


#55. pd.crosstab()
Computes a cross-tabulation of two (or more) factors.

import pandas as pd
df = pd.DataFrame({'A': ['foo', 'foo', 'bar'], 'B': ['one', 'two', 'one']})
crosstab = pd.crosstab(df.A, df.B)
print(crosstab)

B    one  two
A
bar 1 0
foo 1 1

---
#DataAnalysis #Pandas #Merging #Joining

Part 5: Pandas - Merging & Concatenating

#56. pd.merge()
Merges DataFrame or named Series objects with a database-style join.

import pandas as pd
df1 = pd.DataFrame({'key': ['A', 'B'], 'val1': [1, 2]})
df2 = pd.DataFrame({'key': ['A', 'B'], 'val2': [3, 4]})
merged = pd.merge(df1, df2, on='key')
print(merged)

key  val1  val2
0 A 1 3
1 B 2 4


#57. pd.concat()
Concatenates pandas objects along a particular axis.

import pandas as pd
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'A': [3, 4]})
concatenated = pd.concat([df1, df2])
print(concatenated)

A
0 1
1 2
0 3
1 4


#58. df.join()
Joins columns with other DataFrame(s) on index or on a key column.
1