In [1]:
import pandas as pd

# Load the dataset and show its first five rows (1 point)
data = pd.read_csv('auto-mpg.csv')
print("数据集的前五行:", data.head(5), sep="\n")

# Report the dtype of every column as read from the CSV
print(data.dtypes)

# Check for missing values, then drop the rows that contain them (2 points)
print("\n检查缺失值:")
print(data.isna().sum())

# Clean in a single chain:
#   1. drop every row that has a NaN in any column,
#   2. convert 'horsepower' to a numeric dtype; entries that cannot be parsed
#      become NaN because of errors='coerce' (1 point),
#   3. drop the rows whose 'horsepower' failed to convert.
data = (
    data.dropna()
    .assign(horsepower=lambda frame: pd.to_numeric(frame['horsepower'], errors='coerce'))
    .dropna(subset=['horsepower'])
)

# Show the dtype of the converted 'horsepower' column
print(data['horsepower'].dtype)

# Re-check missing values after cleaning
print("\n检查清洗后的缺失值:")
print(data.isna().sum())

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Columns to standardize, and the features / target used downstream (2 points)
numerical_features = ['displacement', 'horsepower', 'weight', 'acceleration']
selected_features = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin']

# Split into training and test sets BEFORE scaling (1 point).
# Fitting the scaler on the full dataset would let the test rows influence the
# mean/std used for preprocessing (data leakage), so the split comes first.
X = data[selected_features]
y = data['mpg']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the numeric columns (1 point): the mean and standard deviation
# are learned from the training rows only.
scaler = StandardScaler().fit(X_train[numerical_features])


def _standardize(frame):
    """Return a copy of `frame` with the `numerical_features` columns scaled.

    Uses the module-level `scaler`, which was fitted on the training rows only.
    Working on a copy avoids writing into a slice of another DataFrame.
    """
    scaled = frame.copy()
    scaled[numerical_features] = scaler.transform(frame[numerical_features])
    return scaled


# Apply the train-fitted transform to both partitions.
X_train = _standardize(X_train)
X_test = _standardize(X_test)

# Keep `data` and `X` standardized as well, since later steps read them;
# they use the same train-fitted scaler so all frames stay consistent.
data = _standardize(data)
X = data[selected_features]


# Build the export frame: a copy of the feature columns with the 'mpg' target
# appended as the last column
cleaned_data = X.assign(mpg=y)

# Write the cleaned and processed data to CSV, omitting the index column
cleaned_data.to_csv('2.1.1_cleaned_data.csv', index=False)

# Print a message confirming that the file was saved
print("\n清洗后的数据已保存到 2.1.1_cleaned_data.csv")
数据集的前五行:
    mpg  cylinders  displacement horsepower  weight  acceleration  model year  \
0  18.0        8.0           NaN        130  3504.0          12.0        70.0   
1  15.0        8.0         350.0        165  3693.0          11.5        70.0   
2  18.0        8.0         318.0        150  3436.0          11.0        70.0   
3  16.0        8.0         304.0        150  3433.0          12.0        70.0   
4  17.0        8.0         302.0        140  3449.0          10.5        70.0   

   origin                   car name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  
mpg             float64
cylinders       float64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model year      float64
origin            int64
car name         object
dtype: object

检查缺失值:
mpg             0
cylinders       1
displacement    3
horsepower      1
weight          2
acceleration    1
model year      1
origin          0
car name        0
dtype: int64
float64

检查清洗后的缺失值:
mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

清洗后的数据已保存到 2.1.1_cleaned_data.csv
In [ ]: