Forwarded from Python Data Science Jobs & Interviews
In Python, NumPy is the cornerstone of scientific computing, offering high-performance multidimensional arrays and tools for working with them—critical for data science interviews and real-world applications! 📊
By: @DataScienceQ 🚀
#Python #NumPy #DataScience #CodingInterview #MachineLearning #ScientificComputing #DataAnalysis #Programming #TechJobs #DeveloperTips
import numpy as np
# Array Creation - The foundation of NumPy
arr = np.array([1, 2, 3])
zeros = np.zeros((2, 3)) # 2x3 matrix of zeros
ones = np.ones((2, 2), dtype=int) # Integer matrix
arange = np.arange(0, 10, 2) # [0 2 4 6 8]
linspace = np.linspace(0, 1, 5) # [0. 0.25 0.5 0.75 1. ]
print(linspace)
# Array Attributes - Master your data's structure
matrix = np.array([[1, 2, 3], [4, 5, 6]])
print(matrix.shape) # Output: (2, 3)
print(matrix.ndim) # Output: 2
print(matrix.dtype) # Output: int64
print(matrix.size) # Output: 6
# Indexing & Slicing - Precision data access
data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(data[1, 2]) # Output: 6 (row 1, col 2)
print(data[0:2, 1:3]) # Output: [[2 3], [5 6]]
print(data[:, -1]) # Output: [3 6 9] (last column)
# Reshaping Arrays - Transform dimensions effortlessly
flat = np.arange(6)
reshaped = flat.reshape(2, 3)
raveled = reshaped.ravel()
print(reshaped)
# Output: [[0 1 2], [3 4 5]]
print(raveled) # Output: [0 1 2 3 4 5]
# Stacking Arrays - Combine datasets vertically/horizontally
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
print(np.vstack((a, b))) # Vertical stack
# Output: [[1 2 3], [4 5 6]]
print(np.hstack((a, b))) # Horizontal stack
# Output: [1 2 3 4 5 6]
# Mathematical Operations - Vectorized calculations
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])
print(x + y) # Output: [5 7 9]
print(x * 2) # Output: [2 4 6]
print(np.dot(x, y)) # Output: 32 (1*4 + 2*5 + 3*6)
# Broadcasting Magic - Operate on mismatched shapes
matrix = np.array([[1, 2, 3], [4, 5, 6]])
scalar = 10
print(matrix + scalar)
# Output: [[11 12 13], [14 15 16]]
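# Added sketch: broadcasting also works array-to-array, not only with scalars.
# A 1-D array whose length matches the last dimension is applied to every row of `matrix`.
row = np.array([10, 20, 30])
print(matrix + row)
# Output: [[11 22 33], [14 25 36]]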
# Aggregation Functions - Statistical power in one line
values = np.array([1, 5, 3, 9, 7])
print(np.sum(values)) # Output: 25
print(np.mean(values)) # Output: 5.0
print(np.max(values)) # Output: 9
print(np.std(values)) # Output: 2.8284271247461903
# Boolean Masking - Filter data like a pro
temperatures = np.array([18, 25, 12, 30, 22])
hot_days = temperatures > 24
print(temperatures[hot_days]) # Output: [25 30]
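# Added sketch: masks can be combined with & (and) / | (or); wrap each condition in parentheses.
mild_days = temperatures[(temperatures > 15) & (temperatures < 26)]
print(mild_days) # Output: [18 25 22]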
# Random Number Generation - Simulate real-world data
print(np.random.rand(2, 2)) # Uniform distribution
print(np.random.randn(3)) # Normal distribution
print(np.random.randint(0, 10, (2, 3))) # Random integers
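# Added aside: recent NumPy versions recommend the Generator API for new code.
rng = np.random.default_rng(seed=42)
print(rng.integers(0, 10, size=(2, 3))) # Random integers, reproducible for a fixed seed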
# Linear Algebra Essentials - Solve equations like a physicist
A = np.array([[3, 1], [1, 2]])
b = np.array([9, 8])
x = np.linalg.solve(A, b)
print(x) # Output: [2. 3.] (Solution to 3x+y=9 and x+2y=8)
# Matrix inverse and determinant
print(np.linalg.inv(A)) # Output: [[ 0.4 -0.2], [-0.2 0.6]]
print(np.linalg.det(A)) # Output: 5.0
# File Operations - Save/load your computational work
data = np.array([[1, 2], [3, 4]])
np.save('array.npy', data)
loaded = np.load('array.npy')
print(np.array_equal(data, loaded)) # Output: True
# Interview Power Move: Vectorization vs Loops
# Vectorized NumPy operations are typically an order of magnitude faster than native Python loops
def square_sum(n):
    arr = np.arange(n)
    return np.sum(arr ** 2)
print(square_sum(5)) # Output: 30 (0²+1²+2²+3²+4²)
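# A rough timing sketch (added, illustrative only; the exact speedup depends on n and hardware):
import timeit
loop_time = timeit.timeit(lambda: sum(i ** 2 for i in range(100_000)), number=100)
vec_time = timeit.timeit(lambda: np.sum(np.arange(100_000) ** 2), number=100)
print(f"Python loop: {loop_time:.3f}s, NumPy vectorized: {vec_time:.3f}s")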
# Pro Tip: Memory-efficient data processing
# Process a large on-disk array (~400 MB here) without loading it into memory
# (assumes 'large_data.bin' already exists with a matching dtype and shape)
large_array = np.memmap('large_data.bin', dtype='float32', mode='r', shape=(1000000, 100))
print(large_array[0:5, 0:3]) # Process small slice
Top 100 Data Analysis Commands & Functions
#DataAnalysis #Pandas #DataLoading #Inspection
Part 1: Pandas - Data Loading & Inspection
#1.
pd.read_csv(): Reads a comma-separated values (csv) file into a Pandas DataFrame.
import pandas as pd
from io import StringIO
csv_data = "col1,col2,col3\n1,a,True\n2,b,False"
df = pd.read_csv(StringIO(csv_data))
print(df)
col1 col2 col3
0 1 a True
1 2 b False
#2.
df.head(): Returns the first n rows of the DataFrame (default is 5).
import pandas as pd
df = pd.DataFrame({'A': range(10), 'B': list('abcdefghij')})
print(df.head(3))
A B
0 0 a
1 1 b
2 2 c
#3.
df.tail(): Returns the last n rows of the DataFrame (default is 5).
import pandas as pd
df = pd.DataFrame({'A': range(10), 'B': list('abcdefghij')})
print(df.tail(3))
A B
7 7 h
8 8 i
9 9 j
#4.
df.info(): Prints a concise summary of a DataFrame, including data types and non-null values.
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, 2, np.nan], 'B': ['x', 'y', 'z']})
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 2 non-null float64
1 B 3 non-null object
dtypes: float64(1), object(1)
memory usage: 176.0+ bytes
#5.
df.describe(): Generates descriptive statistics for numerical columns.
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3, 4, 5]})
print(df.describe())
A
count 5.000000
mean 3.000000
std 1.581139
min 1.000000
25% 2.000000
50% 3.000000
75% 4.000000
max 5.000000
#6.
df.shape: Returns a tuple representing the dimensionality (rows, columns) of the DataFrame.
import pandas as pd
df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]})
print(df.shape)
(2, 3)
#7.
df.columns: Returns the column labels of the DataFrame.
import pandas as pd
df = pd.DataFrame({'Name': ['Alice'], 'Age': [30]})
print(df.columns)
Index(['Name', 'Age'], dtype='object')
#8.
df.dtypes: Returns the data types of each column.
import pandas as pd
df = pd.DataFrame({'A': [1, 2], 'B': [1.1, 2.2], 'C': ['x', 'y']})
print(df.dtypes)
A int64
B float64
C object
dtype: object
#9.
df['col'].value_counts(): Returns a Series containing counts of unique values in a column.
import pandas as pd
df = pd.DataFrame({'Fruit': ['Apple', 'Banana', 'Apple', 'Orange', 'Banana', 'Apple']})
print(df['Fruit'].value_counts())
Apple 3
Banana 2
Orange 1
Name: Fruit, dtype: int64
#10.
df['col'].unique(): Returns an array of the unique values in a column.
import pandas as pd
df = pd.DataFrame({'Fruit': ['Apple', 'Banana', 'Apple', 'Orange']})
print(df['Fruit'].unique())
['Apple' 'Banana' 'Orange']
#11.
df['col'].nunique(): Returns the number of unique values in a column.
import pandas as pd
df = pd.DataFrame({'Fruit': ['Apple', 'Banana', 'Apple', 'Orange']})
print(df['Fruit'].nunique())
3
#12.
df.isnull(): Returns a DataFrame of boolean values indicating missing values.
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan], 'B': [np.nan, 'x']})
print(df.isnull())
A B
0 False True
1 True False
#13.
df.isnull().sum(): Returns the number of missing values in each column.
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan, 3, np.nan], 'B': [5, 6, 7, 8]})
print(df.isnull().sum())
A 2
B 0
dtype: int64
#14.
df.to_csv(): Writes the DataFrame to a comma-separated values (csv) file.
import pandas as pd
df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
csv_output = df.to_csv(index=False)
print(csv_output)
A,B
1,3
2,4
#15.
df.copy(): Creates a deep copy of a DataFrame.
import pandas as pd
df1 = pd.DataFrame({'A': [1]})
df2 = df1.copy()
df2.loc[0, 'A'] = 99
print(f"Original df1:\n{df1}")
print(f"Copied df2:\n{df2}")
Original df1:
A
0 1
Copied df2:
A
0 99
---
#DataAnalysis #Pandas #Selection #Indexing
Part 2: Pandas - Data Selection & Indexing
#16.
df['col']: Selects a single column as a Series.
import pandas as pd
df = pd.DataFrame({'Name': ['Alice', 'Bob'], 'Age': [30, 25]})
print(df['Name'])
0 Alice
1 Bob
Name: Name, dtype: object
#17.
df[['col1', 'col2']]: Selects multiple columns as a new DataFrame.
import pandas as pd
df = pd.DataFrame({'Name': ['Alice'], 'Age': [30], 'City': ['New York']})
print(df[['Name', 'City']])
Name City
0 Alice New York
#18.
df.loc[]: Accesses a group of rows and columns by label(s) or a boolean array.
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]}, index=['x', 'y', 'z'])
print(df.loc['y'])
A 2
Name: y, dtype: int64
#19.
df.iloc[]: Accesses a group of rows and columns by integer position(s).
import pandas as pd
df = pd.DataFrame({'A': [10, 20, 30]})
print(df.iloc[1])
A 20
Name: 1, dtype: int64
#20.
df[df['col'] > value]: Selects rows based on a boolean condition (boolean indexing).
import pandas as pd
df = pd.DataFrame({'Age': [22, 35, 18, 40]})
print(df[df['Age'] > 30])
Age
1 35
3 40
#21.
df.set_index(): Sets the DataFrame index using existing columns.
import pandas as pd
df = pd.DataFrame({'Country': ['USA', 'UK'], 'Code': [1, 44]})
df_indexed = df.set_index('Country')
print(df_indexed)
Code
Country
USA 1
UK 44
#22.
df.reset_index(): Resets the index of the DataFrame and uses the default integer index.
import pandas as pd
df = pd.DataFrame({'Code': [1, 44]}, index=['USA', 'UK'])
df_reset = df.reset_index()
print(df_reset)
index Code
0 USA 1
1 UK 44
#23.
df.at[]: Accesses a single value by row/column label pair. Faster than .loc.
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]}, index=['x', 'y', 'z'])
print(df.at['y', 'A'])
2
#24.
df.iat[]: Accesses a single value by row/column integer position. Faster than .iloc.
import pandas as pd
df = pd.DataFrame({'A': [10, 20, 30]})
print(df.iat[1, 0])
20
#25.
df.sample(): Returns a random sample of items from an axis of object.
import pandas as pd
df = pd.DataFrame({'A': range(10)})
print(df.sample(n=3))
A
8 8
2 2
5 5
(Note: Output rows will be random)
---
#DataAnalysis #Pandas #DataCleaning #Manipulation
Part 3: Pandas - Data Cleaning & Manipulation
#26.
df.dropna(): Removes missing values.
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan, 3]})
print(df.dropna())
A
0 1.0
2 3.0
#27.
df.fillna(): Fills missing (NA/NaN) values using a specified method.
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, np.nan, 3]})
print(df.fillna(0))
A
0 1.0
1 0.0
2 3.0
#28.
df.astype(): Casts a pandas object to a specified dtype.
import pandas as pd
df = pd.DataFrame({'A': [1.1, 2.7, 3.5]})
df['A'] = df['A'].astype(int)
print(df)
A
0 1
1 2
2 3
#29.
df.rename(): Alters axes labels.
import pandas as pd
df = pd.DataFrame({'a': [1], 'b': [2]})
df_renamed = df.rename(columns={'a': 'A', 'b': 'B'})
print(df_renamed)
A B
0 1 2
#30.
df.drop(): Drops specified labels from rows or columns.
import pandas as pd
df = pd.DataFrame({'A': [1], 'B': [2], 'C': [3]})
df_dropped = df.drop(columns=['B'])
print(df_dropped)
A C
0 1 3
#31.
pd.to_datetime(): Converts argument to datetime.
import pandas as pd
s = pd.Series(['2023-01-01', '2023-01-02'])
dt_s = pd.to_datetime(s)
print(dt_s)
0 2023-01-01
1 2023-01-02
dtype: datetime64[ns]
#32.
df.apply(): Applies a function along an axis of the DataFrame.
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]})
df['B'] = df['A'].apply(lambda x: x * 2)
print(df)
A B
0 1 2
1 2 4
2 3 6
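(Added aside, not part of the original list: apply also works row-wise with axis=1, e.g. to combine several columns per row.)
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3], 'B': [10, 20, 30]})
df['total'] = df.apply(lambda row: row['A'] + row['B'], axis=1)
print(df['total'].tolist())
[11, 22, 33]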
#33.
df['col'].map(): Maps values of a Series according to an input mapping or function.
import pandas as pd
df = pd.DataFrame({'Gender': ['M', 'F', 'M']})
df['Gender_Full'] = df['Gender'].map({'M': 'Male', 'F': 'Female'})
print(df)
Gender Gender_Full
0 M Male
1 F Female
2 M Male
#34.
df.replace(): Replaces values given in to_replace with value.
import pandas as pd
df = pd.DataFrame({'Score': [10, -99, 15, -99]})
df_replaced = df.replace(-99, 0)
print(df_replaced)
Score
0 10
1 0
2 15
3 0
#35.
df.duplicated(): Returns a boolean Series denoting duplicate rows.
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 1], 'B': ['a', 'b', 'a']})
print(df.duplicated())
0 False
1 False
2 True
dtype: bool
#36.
df.drop_duplicates(): Returns a DataFrame with duplicate rows removed.
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 1], 'B': ['a', 'b', 'a']})
print(df.drop_duplicates())
A B
0 1 a
1 2 b
#37.
df.sort_values(): Sorts by the values along either axis.
import pandas as pd
df = pd.DataFrame({'Age': [25, 22, 30]})
print(df.sort_values(by='Age'))
Age
1 22
0 25
2 30
#38.
df.sort_index(): Sorts object by labels (along an axis).
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3]}, index=[10, 5, 8])
print(df.sort_index())
A
5 2
8 3
10 1
#39.
pd.cut(): Bins values into discrete intervals.
import pandas as pd
ages = pd.Series([22, 35, 58, 8, 42])
age_bins = pd.cut(ages, bins=[0, 18, 35, 60], labels=['Child', 'Adult', 'Senior'])
print(age_bins)
0 Adult
1 Adult
2 Senior
3 Child
4 Senior
dtype: category
Categories (3, object): ['Child' < 'Adult' < 'Senior']
#40.
pd.qcut(): Quantile-based discretization function (bins into equal-sized groups).
import pandas as pd
data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
quartiles = pd.qcut(data, 4, labels=False)
print(quartiles)
0 0
1 0
2 0
3 1
4 1
5 2
6 2
7 3
8 3
9 3
dtype: int64
#41.
s.str.contains(): Tests if a pattern or regex is contained within a string of a Series.
import pandas as pd
s = pd.Series(['apple', 'banana', 'apricot'])
print(s[s.str.contains('ap')])
0 apple
2 apricot
dtype: object
#42.
s.str.split(): Splits strings around a given separator/delimiter.
import pandas as pd
s = pd.Series(['a_b', 'c_d'])
print(s.str.split('_', expand=True))
0 1
0 a b
1 c d
#43.
s.str.lower(): Converts strings in the Series to lowercase.
import pandas as pd
s = pd.Series(['HELLO', 'World'])
print(s.str.lower())
0 hello
1 world
dtype: object
#44.
s.str.strip(): Removes leading and trailing whitespace.
import pandas as pd
s = pd.Series([' hello ', ' world '])
print(s.str.strip())
0 hello
1 world
dtype: object
#45.
s.dt.year: Extracts the year from a datetime Series.
import pandas as pd
s = pd.to_datetime(pd.Series(['2023-01-01', '2024-05-10']))
print(s.dt.year)
0 2023
1 2024
dtype: int64
---
#DataAnalysis #Pandas #Grouping #Aggregation
Part 4: Pandas - Grouping & Aggregation
#46.
df.groupby(): Groups a DataFrame using a mapper or by a Series of columns.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
grouped = df.groupby('Team')
print(grouped)
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x...>
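(Added aside: the GroupBy object itself is lazy; a quick way to inspect a single group is get_group, before aggregating as the next entries show.)
print(grouped.get_group('A'))
  Team  Points
0    A      10
2    A      12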
#47.
groupby.agg(): Aggregates using one or more operations over the specified axis.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
agg_df = df.groupby('Team').agg(['mean', 'sum'])
print(agg_df)
Points
mean sum
Team
A 11.0 22
B 7.0 14
#48.
groupby.size(): Computes group sizes.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B', 'A']})
print(df.groupby('Team').size())
Team
A 3
B 2
dtype: int64
#49.
groupby.count(): Computes the count of non-NA cells for each group.
import pandas as pd
import numpy as np
df = pd.DataFrame({'Team': ['A', 'B', 'A'], 'Score': [1, np.nan, 3]})
print(df.groupby('Team').count())
Score
Team
A 2
B 0
#50.
groupby.mean(): Computes the mean of group values.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').mean())
Points
Team
A 11.0
B 7.0
#51.
groupby.sum(): Computes the sum of group values.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').sum())
Points
Team
A 22
B 14
#52.
groupby.min(): Computes the minimum of group values.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').min())
Points
Team
A 10
B 6
#53.
groupby.max(): Computes the maximum of group values.
import pandas as pd
df = pd.DataFrame({'Team': ['A', 'B', 'A', 'B'], 'Points': [10, 8, 12, 6]})
print(df.groupby('Team').max())
Points
Team
A 12
B 8
#54.
df.pivot_table(): Creates a spreadsheet-style pivot table as a DataFrame.
import pandas as pd
df = pd.DataFrame({'A': ['foo', 'foo', 'bar'], 'B': ['one', 'two', 'one'], 'C': [1, 2, 3]})
pivot = df.pivot_table(values='C', index='A', columns='B')
print(pivot)
B one two
A
bar 3.0 NaN
foo 1.0 2.0
#55.
pd.crosstab(): Computes a cross-tabulation of two (or more) factors.
import pandas as pd
df = pd.DataFrame({'A': ['foo', 'foo', 'bar'], 'B': ['one', 'two', 'one']})
crosstab = pd.crosstab(df.A, df.B)
print(crosstab)
B one two
A
bar 1 0
foo 1 1
---
#DataAnalysis #Pandas #Merging #Joining
Part 5: Pandas - Merging & Concatenating
#56.
pd.merge(): Merges DataFrame or named Series objects with a database-style join.
import pandas as pd
df1 = pd.DataFrame({'key': ['A', 'B'], 'val1': [1, 2]})
df2 = pd.DataFrame({'key': ['A', 'B'], 'val2': [3, 4]})
merged = pd.merge(df1, df2, on='key')
print(merged)
key val1 val2
0 A 1 3
1 B 2 4
#57.
pd.concat(): Concatenates pandas objects along a particular axis.
import pandas as pd
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'A': [3, 4]})
concatenated = pd.concat([df1, df2])
print(concatenated)
A
0 1
1 2
0 3
1 4
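(Added note: the repeated 0/1 index above is expected; pass ignore_index=True to renumber the concatenated result.)
print(pd.concat([df1, df2], ignore_index=True))
   A
0  1
1  2
2  3
3  4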
#58.
df.join(): Joins columns with other DataFrame(s) on index or on a key column.
import pandas as pd
df1 = pd.DataFrame({'val1': [1, 2]}, index=['A', 'B'])
df2 = pd.DataFrame({'val2': [3, 4]}, index=['A', 'B'])
joined = df1.join(df2)
print(joined)
val1 val2
A 1 3
B 2 4
#59.
pd.get_dummies(): Converts categorical variable into dummy/indicator variables (one-hot encoding).
import pandas as pd
s = pd.Series(list('abca'))
dummies = pd.get_dummies(s)
print(dummies)
a b c
0 1 0 0
1 0 1 0
2 0 0 1
3 1 0 0
#60.
df.nlargest(): Returns the first n rows ordered by columns in descending order.
import pandas as pd
df = pd.DataFrame({'population': [100, 500, 200, 800]})
print(df.nlargest(2, 'population'))
population
3 800
1 500
---
#DataAnalysis #NumPy #Arrays
Part 6: NumPy - Array Creation & Manipulation
#61.
np.array(): Creates a NumPy ndarray.
import numpy as np
arr = np.array([1, 2, 3])
print(arr)
[1 2 3]
#62.
np.arange(): Returns an array with evenly spaced values within a given interval.
import numpy as np
arr = np.arange(0, 5)
print(arr)
[0 1 2 3 4]
#63.
np.linspace(): Returns an array with evenly spaced numbers over a specified interval.
import numpy as np
arr = np.linspace(0, 10, 5)
print(arr)
[ 0. 2.5 5. 7.5 10. ]
#64.
np.zeros(): Returns a new array of a given shape and type, filled with zeros.
import numpy as np
arr = np.zeros((2, 3))
print(arr)
[[0. 0. 0.]
[0. 0. 0.]]
#65.
np.ones(): Returns a new array of a given shape and type, filled with ones.
import numpy as np
arr = np.ones((2, 3))
print(arr)
[[1. 1. 1.]
[1. 1. 1.]]
#66.
np.random.rand(): Creates an array of the given shape and populates it with random samples from a uniform distribution over [0, 1).
import numpy as np
arr = np.random.rand(2, 2)
print(arr)
[[0.13949386 0.2921446 ]
[0.52273283 0.77122228]]
(Note: Output values will be random)
#67.
arr.reshape(): Gives a new shape to an array without changing its data.
import numpy as np
arr = np.arange(6)
reshaped_arr = arr.reshape((2, 3))
print(reshaped_arr)
[[0 1 2]
[3 4 5]]
#68.
np.concatenate(): Joins a sequence of arrays along an existing axis.
import numpy as np
a = np.array([[1, 2]])
b = np.array([[3, 4]])
print(np.concatenate((a, b), axis=0))
[[1 2]
[3 4]]
#69.
np.vstack(): Stacks arrays in sequence vertically (row wise).
import numpy as np
a = np.array([1, 2])
b = np.array([3, 4])
print(np.vstack((a, b)))
[[1 2]
[3 4]]
#70.
np.hstack(): Stacks arrays in sequence horizontally (column wise).
import numpy as np
a = np.array([1, 2])
b = np.array([3, 4])
print(np.hstack((a, b)))
[1 2 3 4]
---
#DataAnalysis #NumPy #Math #Statistics
Part 7: NumPy - Mathematical & Statistical Functions
#71.
np.mean(): Computes the arithmetic mean along the specified axis.
import numpy as np
arr = np.array([1, 2, 3, 4, 5])
print(np.mean(arr))
3.0
#72.
np.median(): Computes the median along the specified axis.
import numpy as np
arr = np.array([1, 2, 3, 4, 5])
print(np.median(arr))
3.0
#73.
np.std(): Computes the standard deviation along the specified axis.
import numpy as np
arr = np.array([1, 2, 3, 4, 5])
print(np.std(arr))
1.4142135623730951
#74.
np.sum(): Sums array elements over a given axis.
import numpy as np
arr = np.array([[1, 2], [3, 4]])
print(np.sum(arr))
10
#75.
np.min(): Returns the minimum of an array or minimum along an axis.
import numpy as np
arr = np.array([5, 2, 8, 1])
print(np.min(arr))
1
#76.
np.max(): Returns the maximum of an array or maximum along an axis.
import numpy as np
arr = np.array([5, 2, 8, 1])
print(np.max(arr))
8
#77.
np.sqrt(): Returns the non-negative square-root of an array, element-wise.
import numpy as np
arr = np.array([4, 9, 16])
print(np.sqrt(arr))
[2. 3. 4.]
#78.
np.log(): Calculates the natural logarithm, element-wise.
import numpy as np
arr = np.array([1, np.e, np.e**2])
print(np.log(arr))
[0. 1. 2.]
#79.
np.dot(): Calculates the dot product of two arrays.
import numpy as np
a = np.array([1, 2])
b = np.array([3, 4])
print(np.dot(a, b))
11
#80.
np.where(): Returns elements chosen from x or y depending on a condition.
import numpy as np
arr = np.array([10, 5, 20, 15])
print(np.where(arr > 12, 'High', 'Low'))
['Low' 'Low' 'High' 'High']
---
#DataAnalysis #Matplotlib #Seaborn #Visualization
Part 8: Matplotlib & Seaborn - Data Visualization
#81.
plt.plot(): Plots y versus x as lines and/or markers.
import matplotlib.pyplot as plt
plt.plot([1, 2, 3, 4], [1, 4, 9, 16])
# In a real script, you would call plt.show()
print("Output: A figure window opens displaying a line plot.")
Output: A figure window opens displaying a line plot.
#82.
plt.scatter(): Makes a scatter plot of y vs. x with varying marker size and/or color.
import matplotlib.pyplot as plt
plt.scatter([1, 2, 3, 4], [1, 4, 9, 16])
print("Output: A figure window opens displaying a scatter plot.")
Output: A figure window opens displaying a scatter plot.
#83.
plt.hist(): Computes and draws the histogram of x.
import matplotlib.pyplot as plt
import numpy as np
data = np.random.randn(1000)
plt.hist(data, bins=30)
print("Output: A figure window opens displaying a histogram.")
Output: A figure window opens displaying a histogram.
#84.
plt.bar(): Makes a bar plot.
import matplotlib.pyplot as plt
plt.bar(['A', 'B', 'C'], [10, 15, 7])
print("Output: A figure window opens displaying a bar chart.")
Output: A figure window opens displaying a bar chart.
#85.
plt.boxplot(): Makes a box and whisker plot.
import matplotlib.pyplot as plt
import numpy as np
data = [np.random.normal(0, std, 100) for std in range(1, 4)]
plt.boxplot(data)
print("Output: A figure window opens displaying a box plot.")
Output: A figure window opens displaying a box plot.
#86.
sns.heatmap(): Plots rectangular data as a color-encoded matrix.
import seaborn as sns
import numpy as np
data = np.random.rand(10, 12)
sns.heatmap(data)
print("Output: A figure window opens displaying a heatmap.")
Output: A figure window opens displaying a heatmap.
#87.
sns.pairplot(): Plots pairwise relationships in a dataset.
import seaborn as sns
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(100, 4), columns=['A', 'B', 'C', 'D'])
# sns.pairplot(df) # This line would generate the plot
print("Output: A figure grid opens showing scatterplots for each pair of variables.")
Output: A figure grid opens showing scatterplots for each pair of variables.
#88.
sns.countplot(): Shows the counts of observations in each categorical bin using bars.
import seaborn as sns
import pandas as pd
df = pd.DataFrame({'category': ['A', 'B', 'A', 'C', 'A', 'B']})
sns.countplot(x='category', data=df)
print("Output: A figure window opens showing a count plot.")
Output: A figure window opens showing a count plot.
#89.
sns.jointplot(): Draws a plot of two variables with bivariate and univariate graphs.
import seaborn as sns
import pandas as pd
import numpy as np
df = pd.DataFrame({'x': range(50), 'y': np.arange(50) + np.random.randn(50)})
# sns.jointplot(x='x', y='y', data=df) # This line would generate the plot
print("Output: A figure shows a scatter plot with histograms for each axis.")
Output: A figure shows a scatter plot with histograms for each axis.
#90.
plt.show(): Displays all open figures.
import matplotlib.pyplot as plt
plt.plot([1, 2, 3])
# plt.show() # In a script, this is essential to see the plot.
print("Executes the command to render and display the plot.")
Executes the command to render and display the plot.
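(Added tip, as a hedged sketch: in non-interactive scripts you can also write the figure to disk instead of displaying it.)
import matplotlib.pyplot as plt
plt.plot([1, 2, 3])
plt.savefig('figure.png')
print("Output: figure.png is written to the working directory.")
Output: figure.png is written to the working directory.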
---
#DataAnalysis #ScikitLearn #Modeling #Preprocessing
Part 9: Scikit-learn - Modeling & Preprocessing
#91.
train_test_split(): Splits arrays or matrices into random train and test subsets.
from sklearn.model_selection import train_test_split
import numpy as np
X, y = np.arange(10).reshape((5, 2)), range(5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
X_train shape: (3, 2)
X_test shape: (2, 2)
#92.
StandardScaler(): Standardizes features by removing the mean and scaling to unit variance.
from sklearn.preprocessing import StandardScaler
data = [[0, 0], [0, 0], [1, 1], [1, 1]]
scaler = StandardScaler()
print(scaler.fit_transform(data))
[[-1. -1.]
[-1. -1.]
[ 1. 1.]
[ 1. 1.]]
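(Added interview note, a sketch with hypothetical X_train/X_test: fit the scaler on the training split only, then reuse it on the test split to avoid data leakage.)
from sklearn.preprocessing import StandardScaler
X_train, X_test = [[0.0], [1.0], [2.0]], [[1.5]]  # hypothetical tiny splits
scaler = StandardScaler().fit(X_train)  # statistics learned from training data only
print(scaler.transform(X_test))
[[0.61237244]]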
#93.
MinMaxScaler(): Transforms features by scaling each feature to a given range, typically [0, 1].
from sklearn.preprocessing import MinMaxScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler()
print(scaler.fit_transform(data))
[[0. 0. ]
[0.25 0.25]
[0.5 0.5 ]
[1. 1. ]]
#94.
LabelEncoder(): Encodes target labels with values between 0 and n_classes-1.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
encoded = le.fit_transform(['paris', 'tokyo', 'paris'])
print(encoded)
[0 1 0]
#95.
OneHotEncoder(): Encodes categorical features as a one-hot numeric array.
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
X = [['Male'], ['Female'], ['Female']]
print(enc.fit_transform(X).toarray())
[[0. 1.]
[1. 0.]
[1. 0.]]
#96.
LinearRegression(): Ordinary least squares Linear Regression model.
from sklearn.linear_model import LinearRegression
X = [[0], [1], [2]]
y = [0, 1, 2]
reg = LinearRegression().fit(X, y)
print(f"Coefficient: {reg.coef_[0]}")