In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Preprocessing pipeline for the auto-mpg dataset: load, drop incomplete rows,
# coerce 'horsepower' to numeric, standardize the continuous features, split
# into train/test sets, and save the cleaned table to CSV.

# Load the dataset and show its first five rows (1 pt)
data = pd.read_csv('auto-mpg.csv')
print("数据集的前五行:")
print(data.head(5))

# Show the dtype of every column
print(data.dtypes)

# Report the per-column missing-value counts, then drop every row that
# contains at least one missing value (2 pts)
print("\n检查缺失值:")
print(data.isna().sum())
data = data.dropna()

# Convert 'horsepower' to a numeric dtype; entries that cannot be parsed are
# coerced to NaN and the affected rows are dropped afterwards (1 pt)
data['horsepower'] = pd.to_numeric(data['horsepower'], errors='coerce')
data = data.dropna(subset=['horsepower'])

# Show the dtype of 'horsepower' after the conversion
print(data['horsepower'].dtype)

# Re-check the missing-value counts after cleaning
print("\n检查清洗后的缺失值:")
print(data.isna().sum())

# Standardize the continuous numeric columns (1 pt)
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split below, so test-set rows contribute to the scaling statistics --
# confirm that this is intended.
numerical_features = ['displacement', 'horsepower', 'weight', 'acceleration']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numerical_features])
data[numerical_features] = scaled_values

# Select the feature columns and the target variable (2 pts)
selected_features = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin']
X = data[selected_features]
y = data['mpg']

# Split into training and test sets: 20% held out, fixed seed (1 pt)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Combine the features and the target into a single data frame
cleaned_data = X.assign(mpg=y)

# Save the cleaned and processed data (row index is not written)
cleaned_data.to_csv('2.1.1_cleaned_data.csv', index=False)

# Print a message indicating that the file has been saved
print("\n清洗后的数据已保存到 2.1.1_cleaned_data.csv")
数据集的前五行:
mpg cylinders displacement horsepower weight acceleration model year \
0 18.0 8.0 NaN 130 3504.0 12.0 70.0
1 15.0 8.0 350.0 165 3693.0 11.5 70.0
2 18.0 8.0 318.0 150 3436.0 11.0 70.0
3 16.0 8.0 304.0 150 3433.0 12.0 70.0
4 17.0 8.0 302.0 140 3449.0 10.5 70.0
origin car name
0 1 chevrolet chevelle malibu
1 1 buick skylark 320
2 1 plymouth satellite
3 1 amc rebel sst
4 1 ford torino
mpg float64
cylinders float64
displacement float64
horsepower object
weight float64
acceleration float64
model year float64
origin int64
car name object
dtype: object
检查缺失值:
mpg 0
cylinders 1
displacement 3
horsepower 1
weight 2
acceleration 1
model year 1
origin 0
car name 0
dtype: int64
float64
检查清洗后的缺失值:
mpg 0
cylinders 0
displacement 0
horsepower 0
weight 0
acceleration 0
model year 0
origin 0
car name 0
dtype: int64
清洗后的数据已保存到 2.1.1_cleaned_data.csv
In [ ]: