I am trying to implement linear regression with stochastic gradient descent in Python. I already have code for this, but for some reason the line row[column] = float(row[column].strip()) raises an error: "could not convert string to float: 'C'". Any help with resolving this error would be much appreciated.
# Linear Regression with Stochastic Gradient Descent for the Pima Indians Diabetes dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt

filename = 'C:/Users/Vince/Desktop/University of Wyoming PHD/Year 2/Machine Learning/Homeworks/Solutions/HW4/pima-indians-diabetes-training.csv'

# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(filename)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Find the min and max values for each column
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate root mean squared error
def rmse_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        prediction_error = predicted[i] - actual[i]
        sum_error += (prediction_error ** 2)
    mean_error = sum_error / float(len(actual))
    return sqrt(mean_error)

# Evaluate an algorithm using a cross-validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        rmse = rmse_metric(actual, predicted)
        scores.append(rmse)
    return scores

# Make a prediction with coefficients
def predict(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row)-1):
        yhat += coefficients[i + 1] * row[i]
    return yhat

# Estimate linear regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
    coef = [0.0 for i in range(len(train[0]))]
    for epoch in range(n_epoch):
        for row in train:
            yhat = predict(row, coef)
            error = yhat - row[-1]
            coef[0] = coef[0] - l_rate * error
            for i in range(len(row)-1):
                coef[i + 1] = coef[i + 1] - l_rate * error * row[i]
            # print(l_rate, n_epoch, error)
    return coef

# Linear Regression Algorithm with Stochastic Gradient Descent
def linear_regression_sgd(train, test, l_rate, n_epoch):
    predictions = list()
    coef = coefficients_sgd(train, l_rate, n_epoch)
    for row in test:
        yhat = predict(row, coef)
        predictions.append(yhat)
    return(predictions)

# Linear Regression on the Pima Indians diabetes dataset
seed(1)
# Load and prepare data
filename = 'C:/Users/Vince/Desktop/University of Wyoming PHD/Year 2/Machine Learning/Homeworks/Solutions/HW4/pima-indians-diabetes-training.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
# Normalize
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)
# Evaluate the algorithm
n_folds = 5
l_rate = 0.01
n_epoch = 50
scores = evaluate_algorithm(dataset, linear_regression_sgd, n_folds, l_rate, n_epoch)
print('Scores: %s' % scores)
print('Mean RMSE: %.3f' % (sum(scores)/float(len(scores))))
Answer:
To add to @Agni's answer:
The CSV file you are reading contains a header row:
num_preg PlGlcConc BloodP tricept insulin BMI ped_func Age HasDiabetes
When you read the file with reader(file) and iterate over it, the header row is appended to dataset along with the data rows. As a result, the first element of the dataset list is:
>>> dataset
[['num_preg', 'PlGlcConc', 'BloodP', 'tricept', 'insulin', 'BMI', 'ped_func', 'Age', 'HasDiabetes'], ...]
So when str_column_to_float tries to convert that row, it throws the error: could not convert string to float: 'num_preg'.
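A minimal sketch of what goes wrong (the column names below are taken from the header above; the variable name header_row is only for illustration):

# The first row yielded by the reader is the header, i.e. a list of column names.
header_row = ['num_preg', 'PlGlcConc', 'BloodP', 'tricept', 'insulin',
              'BMI', 'ped_func', 'Age', 'HasDiabetes']

# str_column_to_float() runs this on every row, including the header,
# and float() cannot parse a column name.
float(header_row[0].strip())  # ValueError: could not convert string to float: 'num_preg'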
Here is the corrected code:
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        fieldnames = next(csv_reader)  # skip the header row and keep it in case it is needed later
        dataset = list(csv_reader)     # the reader iterator can be turned into a list directly
    return dataset
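For completeness, a hedged usage sketch (reusing filename, str_column_to_float, dataset_minmax, and normalize_dataset from the question's script, unchanged) showing that the pipeline now gets past the conversion step:

dataset = load_csv(filename)

# Every remaining row holds numeric strings, so the conversion no longer
# trips over the header names.
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)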