mirror of
https://github.com/TheAlgorithms/Python.git
synced 2025-07-04 16:57:32 +08:00
Add pep8-naming to pre-commit hooks and fixes incorrect naming conventions (#7062)
* ci(pre-commit): Add pep8-naming to `pre-commit` hooks (#7038) * refactor: Fix naming conventions (#7038) * Update arithmetic_analysis/lu_decomposition.py Co-authored-by: Christian Clauss <cclauss@me.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * refactor(lu_decomposition): Replace `NDArray` with `ArrayLike` (#7038) * chore: Fix naming conventions in doctests (#7038) * fix: Temporarily disable project euler problem 104 (#7069) * chore: Fix naming conventions in doctests (#7038) Co-authored-by: Christian Clauss <cclauss@me.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
@ -6,7 +6,7 @@ Output: The decision tree maps a real number input to a real number output.
|
||||
import numpy as np
|
||||
|
||||
|
||||
class Decision_Tree:
|
||||
class DecisionTree:
|
||||
def __init__(self, depth=5, min_leaf_size=5):
|
||||
self.depth = depth
|
||||
self.decision_boundary = 0
|
||||
@ -22,17 +22,17 @@ class Decision_Tree:
|
||||
@param prediction: a floating point value
|
||||
return value: mean_squared_error calculates the error if prediction is used to
|
||||
estimate the labels
|
||||
>>> tester = Decision_Tree()
|
||||
>>> tester = DecisionTree()
|
||||
>>> test_labels = np.array([1,2,3,4,5,6,7,8,9,10])
|
||||
>>> test_prediction = np.float(6)
|
||||
>>> tester.mean_squared_error(test_labels, test_prediction) == (
|
||||
... Test_Decision_Tree.helper_mean_squared_error_test(test_labels,
|
||||
... TestDecisionTree.helper_mean_squared_error_test(test_labels,
|
||||
... test_prediction))
|
||||
True
|
||||
>>> test_labels = np.array([1,2,3])
|
||||
>>> test_prediction = np.float(2)
|
||||
>>> tester.mean_squared_error(test_labels, test_prediction) == (
|
||||
... Test_Decision_Tree.helper_mean_squared_error_test(test_labels,
|
||||
... TestDecisionTree.helper_mean_squared_error_test(test_labels,
|
||||
... test_prediction))
|
||||
True
|
||||
"""
|
||||
@ -41,10 +41,10 @@ class Decision_Tree:
|
||||
|
||||
return np.mean((labels - prediction) ** 2)
|
||||
|
||||
def train(self, X, y):
|
||||
def train(self, x, y):
|
||||
"""
|
||||
train:
|
||||
@param X: a one dimensional numpy array
|
||||
@param x: a one dimensional numpy array
|
||||
@param y: a one dimensional numpy array.
|
||||
The contents of y are the labels for the corresponding x values
|
||||
|
||||
@ -55,17 +55,17 @@ class Decision_Tree:
|
||||
this section is to check that the inputs conform to our dimensionality
|
||||
constraints
|
||||
"""
|
||||
if X.ndim != 1:
|
||||
if x.ndim != 1:
|
||||
print("Error: Input data set must be one dimensional")
|
||||
return
|
||||
if len(X) != len(y):
|
||||
if len(x) != len(y):
|
||||
print("Error: X and y have different lengths")
|
||||
return
|
||||
if y.ndim != 1:
|
||||
print("Error: Data set labels must be one dimensional")
|
||||
return
|
||||
|
||||
if len(X) < 2 * self.min_leaf_size:
|
||||
if len(x) < 2 * self.min_leaf_size:
|
||||
self.prediction = np.mean(y)
|
||||
return
|
||||
|
||||
@ -74,7 +74,7 @@ class Decision_Tree:
|
||||
return
|
||||
|
||||
best_split = 0
|
||||
min_error = self.mean_squared_error(X, np.mean(y)) * 2
|
||||
min_error = self.mean_squared_error(x, np.mean(y)) * 2
|
||||
|
||||
"""
|
||||
loop over all possible splits for the decision tree. find the best split.
|
||||
@ -82,34 +82,34 @@ class Decision_Tree:
|
||||
then the data set is not split and the average for the entire array is used as
|
||||
the predictor
|
||||
"""
|
||||
for i in range(len(X)):
|
||||
if len(X[:i]) < self.min_leaf_size:
|
||||
for i in range(len(x)):
|
||||
if len(x[:i]) < self.min_leaf_size:
|
||||
continue
|
||||
elif len(X[i:]) < self.min_leaf_size:
|
||||
elif len(x[i:]) < self.min_leaf_size:
|
||||
continue
|
||||
else:
|
||||
error_left = self.mean_squared_error(X[:i], np.mean(y[:i]))
|
||||
error_right = self.mean_squared_error(X[i:], np.mean(y[i:]))
|
||||
error_left = self.mean_squared_error(x[:i], np.mean(y[:i]))
|
||||
error_right = self.mean_squared_error(x[i:], np.mean(y[i:]))
|
||||
error = error_left + error_right
|
||||
if error < min_error:
|
||||
best_split = i
|
||||
min_error = error
|
||||
|
||||
if best_split != 0:
|
||||
left_X = X[:best_split]
|
||||
left_x = x[:best_split]
|
||||
left_y = y[:best_split]
|
||||
right_X = X[best_split:]
|
||||
right_x = x[best_split:]
|
||||
right_y = y[best_split:]
|
||||
|
||||
self.decision_boundary = X[best_split]
|
||||
self.left = Decision_Tree(
|
||||
self.decision_boundary = x[best_split]
|
||||
self.left = DecisionTree(
|
||||
depth=self.depth - 1, min_leaf_size=self.min_leaf_size
|
||||
)
|
||||
self.right = Decision_Tree(
|
||||
self.right = DecisionTree(
|
||||
depth=self.depth - 1, min_leaf_size=self.min_leaf_size
|
||||
)
|
||||
self.left.train(left_X, left_y)
|
||||
self.right.train(right_X, right_y)
|
||||
self.left.train(left_x, left_y)
|
||||
self.right.train(right_x, right_y)
|
||||
else:
|
||||
self.prediction = np.mean(y)
|
||||
|
||||
@ -134,7 +134,7 @@ class Decision_Tree:
|
||||
return None
|
||||
|
||||
|
||||
class Test_Decision_Tree:
|
||||
class TestDecisionTree:
|
||||
"""Decision Tree test class"""
|
||||
|
||||
@staticmethod
|
||||
@ -159,11 +159,11 @@ def main():
|
||||
predict the label of 10 different test values. Then the mean squared error over
|
||||
this test is displayed.
|
||||
"""
|
||||
X = np.arange(-1.0, 1.0, 0.005)
|
||||
y = np.sin(X)
|
||||
x = np.arange(-1.0, 1.0, 0.005)
|
||||
y = np.sin(x)
|
||||
|
||||
tree = Decision_Tree(depth=10, min_leaf_size=10)
|
||||
tree.train(X, y)
|
||||
tree = DecisionTree(depth=10, min_leaf_size=10)
|
||||
tree.train(x, y)
|
||||
|
||||
test_cases = (np.random.rand(10) * 2) - 1
|
||||
predictions = np.array([tree.predict(x) for x in test_cases])
|
||||
|
@ -17,19 +17,19 @@ def main():
|
||||
iris = load_iris()
|
||||
|
||||
# Split dataset into train and test data
|
||||
X = iris["data"] # features
|
||||
Y = iris["target"]
|
||||
x = iris["data"] # features
|
||||
y = iris["target"]
|
||||
x_train, x_test, y_train, y_test = train_test_split(
|
||||
X, Y, test_size=0.3, random_state=1
|
||||
x, y, test_size=0.3, random_state=1
|
||||
)
|
||||
|
||||
# Gaussian Naive Bayes
|
||||
NB_model = GaussianNB()
|
||||
NB_model.fit(x_train, y_train)
|
||||
nb_model = GaussianNB()
|
||||
nb_model.fit(x_train, y_train)
|
||||
|
||||
# Display Confusion Matrix
|
||||
plot_confusion_matrix(
|
||||
NB_model,
|
||||
nb_model,
|
||||
x_test,
|
||||
y_test,
|
||||
display_labels=iris["target_names"],
|
||||
|
@ -26,25 +26,25 @@ def main():
|
||||
print(df_boston.describe().T)
|
||||
# Feature selection
|
||||
|
||||
X = df_boston.iloc[:, :-1]
|
||||
x = df_boston.iloc[:, :-1]
|
||||
y = df_boston.iloc[:, -1] # target variable
|
||||
# split the data with 75% train and 25% test sets.
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
X, y, random_state=0, test_size=0.25
|
||||
x_train, x_test, y_train, y_test = train_test_split(
|
||||
x, y, random_state=0, test_size=0.25
|
||||
)
|
||||
|
||||
model = GradientBoostingRegressor(
|
||||
n_estimators=500, max_depth=5, min_samples_split=4, learning_rate=0.01
|
||||
)
|
||||
# training the model
|
||||
model.fit(X_train, y_train)
|
||||
model.fit(x_train, y_train)
|
||||
# to see how good the model fit the data
|
||||
training_score = model.score(X_train, y_train).round(3)
|
||||
test_score = model.score(X_test, y_test).round(3)
|
||||
training_score = model.score(x_train, y_train).round(3)
|
||||
test_score = model.score(x_test, y_test).round(3)
|
||||
print("Training score of GradientBoosting is :", training_score)
|
||||
print("The test score of GradientBoosting is :", test_score)
|
||||
# Let us evaluate the model by finding the errors
|
||||
y_pred = model.predict(X_test)
|
||||
y_pred = model.predict(x_test)
|
||||
|
||||
# The mean squared error
|
||||
print(f"Mean squared error: {mean_squared_error(y_test, y_pred):.2f}")
|
||||
|
@ -69,8 +69,8 @@ def get_initial_centroids(data, k, seed=None):
|
||||
return centroids
|
||||
|
||||
|
||||
def centroid_pairwise_dist(X, centroids):
|
||||
return pairwise_distances(X, centroids, metric="euclidean")
|
||||
def centroid_pairwise_dist(x, centroids):
|
||||
return pairwise_distances(x, centroids, metric="euclidean")
|
||||
|
||||
|
||||
def assign_clusters(data, centroids):
|
||||
@ -197,8 +197,8 @@ if False: # change to true to run this test case.
|
||||
plot_heterogeneity(heterogeneity, k)
|
||||
|
||||
|
||||
def ReportGenerator(
|
||||
df: pd.DataFrame, ClusteringVariables: np.ndarray, FillMissingReport=None
|
||||
def report_generator(
|
||||
df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Function generates an easy-reading clustering report. It takes 2 arguments as an input:
|
||||
@ -214,7 +214,7 @@ def ReportGenerator(
|
||||
>>> data['col2'] = [100, 200, 300]
|
||||
>>> data['col3'] = [10, 20, 30]
|
||||
>>> data['Cluster'] = [1, 1, 2]
|
||||
>>> ReportGenerator(data, ['col1', 'col2'], 0)
|
||||
>>> report_generator(data, ['col1', 'col2'], 0)
|
||||
Features Type Mark 1 2
|
||||
0 # of Customers ClusterSize False 2.000000 1.000000
|
||||
1 % of Customers ClusterProportion False 0.666667 0.333333
|
||||
@ -231,8 +231,8 @@ def ReportGenerator(
|
||||
[104 rows x 5 columns]
|
||||
"""
|
||||
# Fill missing values with given rules
|
||||
if FillMissingReport:
|
||||
df.fillna(value=FillMissingReport, inplace=True)
|
||||
if fill_missing_report:
|
||||
df.fillna(value=fill_missing_report, inplace=True)
|
||||
df["dummy"] = 1
|
||||
numeric_cols = df.select_dtypes(np.number).columns
|
||||
report = (
|
||||
@ -313,7 +313,7 @@ def ReportGenerator(
|
||||
report = pd.concat(
|
||||
[report, a, clustersize, clusterproportion], axis=0
|
||||
) # concat report with cluster size and nan values
|
||||
report["Mark"] = report["Features"].isin(ClusteringVariables)
|
||||
report["Mark"] = report["Features"].isin(clustering_variables)
|
||||
cols = report.columns.tolist()
|
||||
cols = cols[0:2] + cols[-1:] + cols[2:-1]
|
||||
report = report[cols]
|
||||
|
@ -41,11 +41,11 @@ def local_weight(
|
||||
[0.08272556]])
|
||||
"""
|
||||
weight = weighted_matrix(point, training_data_x, bandwidth)
|
||||
W = (training_data_x.T * (weight * training_data_x)).I * (
|
||||
w = (training_data_x.T * (weight * training_data_x)).I * (
|
||||
training_data_x.T * weight * training_data_y.T
|
||||
)
|
||||
|
||||
return W
|
||||
return w
|
||||
|
||||
|
||||
def local_weight_regression(
|
||||
|
@ -35,25 +35,25 @@ def cost_function(h, y):
|
||||
return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
|
||||
|
||||
|
||||
def log_likelihood(X, Y, weights):
|
||||
scores = np.dot(X, weights)
|
||||
return np.sum(Y * scores - np.log(1 + np.exp(scores)))
|
||||
def log_likelihood(x, y, weights):
|
||||
scores = np.dot(x, weights)
|
||||
return np.sum(y * scores - np.log(1 + np.exp(scores)))
|
||||
|
||||
|
||||
# here alpha is the learning rate, x is the feature matrix, y is the target matrix
|
||||
def logistic_reg(alpha, X, y, max_iterations=70000):
|
||||
theta = np.zeros(X.shape[1])
|
||||
def logistic_reg(alpha, x, y, max_iterations=70000):
|
||||
theta = np.zeros(x.shape[1])
|
||||
|
||||
for iterations in range(max_iterations):
|
||||
z = np.dot(X, theta)
|
||||
z = np.dot(x, theta)
|
||||
h = sigmoid_function(z)
|
||||
gradient = np.dot(X.T, h - y) / y.size
|
||||
gradient = np.dot(x.T, h - y) / y.size
|
||||
theta = theta - alpha * gradient # updating the weights
|
||||
z = np.dot(X, theta)
|
||||
z = np.dot(x, theta)
|
||||
h = sigmoid_function(z)
|
||||
J = cost_function(h, y)
|
||||
j = cost_function(h, y)
|
||||
if iterations % 100 == 0:
|
||||
print(f"loss: {J} \t") # printing the loss after every 100 iterations
|
||||
print(f"loss: {j} \t") # printing the loss after every 100 iterations
|
||||
return theta
|
||||
|
||||
|
||||
@ -61,23 +61,23 @@ def logistic_reg(alpha, X, y, max_iterations=70000):
|
||||
|
||||
if __name__ == "__main__":
|
||||
iris = datasets.load_iris()
|
||||
X = iris.data[:, :2]
|
||||
x = iris.data[:, :2]
|
||||
y = (iris.target != 0) * 1
|
||||
|
||||
alpha = 0.1
|
||||
theta = logistic_reg(alpha, X, y, max_iterations=70000)
|
||||
theta = logistic_reg(alpha, x, y, max_iterations=70000)
|
||||
print("theta: ", theta) # printing the theta i.e our weights vector
|
||||
|
||||
def predict_prob(X):
|
||||
def predict_prob(x):
|
||||
return sigmoid_function(
|
||||
np.dot(X, theta)
|
||||
np.dot(x, theta)
|
||||
) # predicting the value of probability from the logistic regression algorithm
|
||||
|
||||
plt.figure(figsize=(10, 6))
|
||||
plt.scatter(X[y == 0][:, 0], X[y == 0][:, 1], color="b", label="0")
|
||||
plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1], color="r", label="1")
|
||||
(x1_min, x1_max) = (X[:, 0].min(), X[:, 0].max())
|
||||
(x2_min, x2_max) = (X[:, 1].min(), X[:, 1].max())
|
||||
plt.scatter(x[y == 0][:, 0], x[y == 0][:, 1], color="b", label="0")
|
||||
plt.scatter(x[y == 1][:, 0], x[y == 1][:, 1], color="r", label="1")
|
||||
(x1_min, x1_max) = (x[:, 0].min(), x[:, 0].max())
|
||||
(x2_min, x2_max) = (x[:, 1].min(), x[:, 1].max())
|
||||
(xx1, xx2) = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
|
||||
grid = np.c_[xx1.ravel(), xx2.ravel()]
|
||||
probs = predict_prob(grid).reshape(xx1.shape)
|
||||
|
@ -15,12 +15,12 @@ test = [[0.0, 0.0], [0.0, 1.0], [1.0, 1.0]]
|
||||
Y = clf.predict(test)
|
||||
|
||||
|
||||
def wrapper(Y):
|
||||
def wrapper(y):
|
||||
"""
|
||||
>>> wrapper(Y)
|
||||
[0, 0, 1]
|
||||
"""
|
||||
return list(Y)
|
||||
return list(y)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -17,10 +17,10 @@ def main():
|
||||
iris = load_iris()
|
||||
|
||||
# Split dataset into train and test data
|
||||
X = iris["data"] # features
|
||||
Y = iris["target"]
|
||||
x = iris["data"] # features
|
||||
y = iris["target"]
|
||||
x_train, x_test, y_train, y_test = train_test_split(
|
||||
X, Y, test_size=0.3, random_state=1
|
||||
x, y, test_size=0.3, random_state=1
|
||||
)
|
||||
|
||||
# Random Forest Classifier
|
||||
|
@ -17,10 +17,10 @@ def main():
|
||||
print(boston.keys())
|
||||
|
||||
# Split dataset into train and test data
|
||||
X = boston["data"] # features
|
||||
Y = boston["target"]
|
||||
x = boston["data"] # features
|
||||
y = boston["target"]
|
||||
x_train, x_test, y_train, y_test = train_test_split(
|
||||
X, Y, test_size=0.3, random_state=1
|
||||
x, y, test_size=0.3, random_state=1
|
||||
)
|
||||
|
||||
# Random Forest Regressor
|
||||
|
@ -80,7 +80,7 @@ class SmoSVM:
|
||||
|
||||
# Calculate alphas using SMO algorithm
|
||||
def fit(self):
|
||||
K = self._k
|
||||
k = self._k
|
||||
state = None
|
||||
while True:
|
||||
|
||||
@ -106,14 +106,14 @@ class SmoSVM:
|
||||
# 3: update threshold(b)
|
||||
b1_new = np.float64(
|
||||
-e1
|
||||
- y1 * K(i1, i1) * (a1_new - a1)
|
||||
- y2 * K(i2, i1) * (a2_new - a2)
|
||||
- y1 * k(i1, i1) * (a1_new - a1)
|
||||
- y2 * k(i2, i1) * (a2_new - a2)
|
||||
+ self._b
|
||||
)
|
||||
b2_new = np.float64(
|
||||
-e2
|
||||
- y2 * K(i2, i2) * (a2_new - a2)
|
||||
- y1 * K(i1, i2) * (a1_new - a1)
|
||||
- y2 * k(i2, i2) * (a2_new - a2)
|
||||
- y1 * k(i1, i2) * (a1_new - a1)
|
||||
+ self._b
|
||||
)
|
||||
if 0.0 < a1_new < self._c:
|
||||
@ -134,8 +134,8 @@ class SmoSVM:
|
||||
if s == i1 or s == i2:
|
||||
continue
|
||||
self._error[s] += (
|
||||
y1 * (a1_new - a1) * K(i1, s)
|
||||
+ y2 * (a2_new - a2) * K(i2, s)
|
||||
y1 * (a1_new - a1) * k(i1, s)
|
||||
+ y2 * (a2_new - a2) * k(i2, s)
|
||||
+ (self._b - b_old)
|
||||
)
|
||||
|
||||
@ -305,56 +305,56 @@ class SmoSVM:
|
||||
|
||||
# Get the new alpha2 and new alpha1
|
||||
def _get_new_alpha(self, i1, i2, a1, a2, e1, e2, y1, y2):
|
||||
K = self._k
|
||||
k = self._k
|
||||
if i1 == i2:
|
||||
return None, None
|
||||
|
||||
# calculate L and H which bound the new alpha2
|
||||
s = y1 * y2
|
||||
if s == -1:
|
||||
L, H = max(0.0, a2 - a1), min(self._c, self._c + a2 - a1)
|
||||
l, h = max(0.0, a2 - a1), min(self._c, self._c + a2 - a1)
|
||||
else:
|
||||
L, H = max(0.0, a2 + a1 - self._c), min(self._c, a2 + a1)
|
||||
if L == H:
|
||||
l, h = max(0.0, a2 + a1 - self._c), min(self._c, a2 + a1)
|
||||
if l == h: # noqa: E741
|
||||
return None, None
|
||||
|
||||
# calculate eta
|
||||
k11 = K(i1, i1)
|
||||
k22 = K(i2, i2)
|
||||
k12 = K(i1, i2)
|
||||
k11 = k(i1, i1)
|
||||
k22 = k(i2, i2)
|
||||
k12 = k(i1, i2)
|
||||
eta = k11 + k22 - 2.0 * k12
|
||||
|
||||
# select the new alpha2 which could get the minimal objectives
|
||||
if eta > 0.0:
|
||||
a2_new_unc = a2 + (y2 * (e1 - e2)) / eta
|
||||
# a2_new has a boundary
|
||||
if a2_new_unc >= H:
|
||||
a2_new = H
|
||||
elif a2_new_unc <= L:
|
||||
a2_new = L
|
||||
if a2_new_unc >= h:
|
||||
a2_new = h
|
||||
elif a2_new_unc <= l:
|
||||
a2_new = l
|
||||
else:
|
||||
a2_new = a2_new_unc
|
||||
else:
|
||||
b = self._b
|
||||
l1 = a1 + s * (a2 - L)
|
||||
h1 = a1 + s * (a2 - H)
|
||||
l1 = a1 + s * (a2 - l)
|
||||
h1 = a1 + s * (a2 - h)
|
||||
|
||||
# way 1
|
||||
f1 = y1 * (e1 + b) - a1 * K(i1, i1) - s * a2 * K(i1, i2)
|
||||
f2 = y2 * (e2 + b) - a2 * K(i2, i2) - s * a1 * K(i1, i2)
|
||||
f1 = y1 * (e1 + b) - a1 * k(i1, i1) - s * a2 * k(i1, i2)
|
||||
f2 = y2 * (e2 + b) - a2 * k(i2, i2) - s * a1 * k(i1, i2)
|
||||
ol = (
|
||||
l1 * f1
|
||||
+ L * f2
|
||||
+ 1 / 2 * l1**2 * K(i1, i1)
|
||||
+ 1 / 2 * L**2 * K(i2, i2)
|
||||
+ s * L * l1 * K(i1, i2)
|
||||
+ l * f2
|
||||
+ 1 / 2 * l1**2 * k(i1, i1)
|
||||
+ 1 / 2 * l**2 * k(i2, i2)
|
||||
+ s * l * l1 * k(i1, i2)
|
||||
)
|
||||
oh = (
|
||||
h1 * f1
|
||||
+ H * f2
|
||||
+ 1 / 2 * h1**2 * K(i1, i1)
|
||||
+ 1 / 2 * H**2 * K(i2, i2)
|
||||
+ s * H * h1 * K(i1, i2)
|
||||
+ h * f2
|
||||
+ 1 / 2 * h1**2 * k(i1, i1)
|
||||
+ 1 / 2 * h**2 * k(i2, i2)
|
||||
+ s * h * h1 * k(i1, i2)
|
||||
)
|
||||
"""
|
||||
# way 2
|
||||
@ -362,9 +362,9 @@ class SmoSVM:
|
||||
objectives
|
||||
"""
|
||||
if ol < (oh - self._eps):
|
||||
a2_new = L
|
||||
a2_new = l
|
||||
elif ol > oh + self._eps:
|
||||
a2_new = H
|
||||
a2_new = h
|
||||
else:
|
||||
a2_new = a2
|
||||
|
||||
|
@ -83,7 +83,7 @@ the third document in the corpus.")
|
||||
return (len([doc for doc in docs if term in doc]), len(docs))
|
||||
|
||||
|
||||
def inverse_document_frequency(df: int, N: int, smoothing=False) -> float:
|
||||
def inverse_document_frequency(df: int, n: int, smoothing=False) -> float:
|
||||
"""
|
||||
Return a float denoting the importance
|
||||
of a word. This measure of importance is
|
||||
@ -109,15 +109,15 @@ def inverse_document_frequency(df: int, N: int, smoothing=False) -> float:
|
||||
1.477
|
||||
"""
|
||||
if smoothing:
|
||||
if N == 0:
|
||||
if n == 0:
|
||||
raise ValueError("log10(0) is undefined.")
|
||||
return round(1 + log10(N / (1 + df)), 3)
|
||||
return round(1 + log10(n / (1 + df)), 3)
|
||||
|
||||
if df == 0:
|
||||
raise ZeroDivisionError("df must be > 0")
|
||||
elif N == 0:
|
||||
elif n == 0:
|
||||
raise ValueError("log10(0) is undefined.")
|
||||
return round(log10(N / df), 3)
|
||||
return round(log10(n / df), 3)
|
||||
|
||||
|
||||
def tf_idf(tf: int, idf: int) -> float:
|
||||
|
Reference in New Issue
Block a user