6. 데이터 전처리 (결측치와 이상치)

import pandas as pd 
# Build a small example DataFrame that contains missing values.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, None, 30],    # Bob's age is missing
    'Score': [90, 85, None],  # Charlie's score is missing
}
df = pd.DataFrame(data)

# Show, cell by cell, where values are missing (True marks a missing cell).
missing_mask = df.isnull()
print("결측치 여부 확인:", missing_mask, sep="\n")

# Tutorial: detecting and handling missing values with pandas.
#
# Every filling strategy below is demonstrated on an untouched snapshot of the
# raw data (`df_raw`) and stored in its own result frame. Otherwise the first
# in-place fill would remove every NaN and all later steps would silently do
# nothing.

import pandas as pd

# Build the example DataFrame.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, None, 30],  # Bob's age is missing
    'Score': [90, 85, None]  # Charlie's score is missing
}
df = pd.DataFrame(data)

# Snapshot of the raw data, used by sections 4 and 5 after `df` itself has
# been modified in place in section 3.
df_raw = df.copy()

# 1. Inspecting missing values

## Print the DataFrame
print(df)

## Number of missing values per column
print(df.isnull().sum())

## Missing-value mask (True / False per cell)
print(df.isnull())

## Rows that contain at least one missing value
print(df[df.isnull().any(axis=1)])

## Columns that contain at least one missing value
print(df.loc[:, df.isnull().any(axis=0)])

# 2. Handling missing values by dropping

## Drop rows that contain a missing value
df_dropped_rows = df.dropna()
print(df_dropped_rows)

## Drop columns that contain a missing value
df_dropped_columns = df.dropna(axis=1)
print(df_dropped_columns)

# 3. Handling missing values by filling

## Fill with one fixed value
df_filled_zero = df.fillna(0)  # every missing value becomes 0
print(df_filled_zero)

df_filled_string = df.fillna("Unknown")  # every missing value becomes "Unknown"
print(df_filled_string)

## Fill each column with its own value
df_filled_custom = df.fillna({"Age": df["Age"].mean(), "Score": df["Score"].median()})
print(df_filled_custom)

## Apply directly to the original (inplace=True)
# From here on `df` has no missing values left; the remaining demos therefore
# start from `df_raw`.
df.fillna(0, inplace=True)

# 4. Filling with summary statistics
# `df[col].fillna(..., inplace=True)` is a chained in-place call that pandas
# warns about and that has no effect under Copy-on-Write, so each result is
# built with a dict-based `fillna` that returns a new frame.

## Replace with the mean
df_mean_filled = df_raw.fillna(
    {"Age": df_raw["Age"].mean(), "Score": df_raw["Score"].mean()}
)
print(df_mean_filled)

## Replace with the median
df_median_filled = df_raw.fillna({"Age": df_raw["Age"].median()})
print(df_median_filled)

## Replace with the mode (when several modes exist, the first one is used)
df_mode_filled = df_raw.fillna({"Age": df_raw["Age"].mode()[0]})
print(df_mode_filled)

# 5. Interpolation

## Linear interpolation
# Only numeric columns are interpolated; the text column 'Name' is left alone.
numeric_columns = df_raw.select_dtypes(include="number").columns
df_interpolated = df_raw.copy()
df_interpolated[numeric_columns] = df_raw[numeric_columns].interpolate(method="linear")
print(df_interpolated)

## Forward fill: propagate the previous valid value
# `ffill()` / `bfill()` replace the deprecated `fillna(method=...)` spelling.
df_ffilled = df_raw.ffill()
print(df_ffilled)

## Backward fill: propagate the next valid value
df_bfilled = df_raw.bfill()
print(df_bfilled)

# Final DataFrame (zero-filled in place in section 3)
print(df)

# Tutorial: detecting and handling outliers with pandas.

import pandas as pd
import numpy as np
from scipy import stats

# Example data with planted outliers.
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eve", "Frank"],
    "Age": [25, 30, 28, 22, 150, 27],  # 150 is an outlier
    "Score": [90, 85, 88, 92, 87, 300]  # 300 is an outlier
}
df = pd.DataFrame(data)

# 1. Inspecting the data

## Print the data
print(df)

## Summary statistics (describe)
print(df.describe())

## IQR (interquartile range) based detection on Age
Q1 = df["Age"].quantile(0.25)
Q3 = df["Age"].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(f"Age 이상치 범위: {lower_bound} ~ {upper_bound}")

# Rows whose Age falls outside the IQR fences
age_is_outlier = (df["Age"] < lower_bound) | (df["Age"] > upper_bound)
outliers_iqr = df[age_is_outlier]
print(outliers_iqr)

# 2. Z-score based detection

## Compute absolute Z-scores
df["Age_Zscore"] = np.abs(stats.zscore(df["Age"]))
df["Score_Zscore"] = np.abs(stats.zscore(df["Score"]))

## Flag rows whose |Z| exceeds the threshold
# With n samples and the population standard deviation (scipy's default),
# |z| can never exceed sqrt(n - 1). For these 6 rows that is about 2.24, so
# the textbook cut-off of 3 could never flag anything. A cut-off of 2 is used
# for this small sample instead.
Z_THRESHOLD = 2.0
outliers_z = df[(df["Age_Zscore"] > Z_THRESHOLD) | (df["Score_Zscore"] > Z_THRESHOLD)]
print(outliers_z)

# 3. Handling outliers by removal

## Remove Age outliers found with the IQR rule
df_cleaned_iqr = df[~age_is_outlier]
print(df_cleaned_iqr)

## Remove outliers found with the Z-score rule
df_cleaned_z = df[(df["Age_Zscore"] <= Z_THRESHOLD) & (df["Score_Zscore"] <= Z_THRESHOLD)]
print(df_cleaned_z)

# 4. Handling outliers by replacement
# Replacement statistics are computed from the inliers only; including the
# outlier itself would drag the mean towards the very value being replaced.
# The alternatives (median, mode, interpolation) are each built on a copy of
# the flagged-but-unmodified frame so that every one of them has an effect.
score_is_outlier = df["Score_Zscore"] > Z_THRESHOLD
age_inliers = df.loc[~age_is_outlier, "Age"]
score_inliers = df.loc[~score_is_outlier, "Score"]
df_flagged = df.copy()

## Replace with the mean (applied to `df` itself)
# `mask` returns a new, suitably typed Series, so a fractional mean can be
# stored even though the original columns hold integers.
df["Age"] = df_flagged["Age"].mask(age_is_outlier, age_inliers.mean())
df["Score"] = df_flagged["Score"].mask(score_is_outlier, score_inliers.mean())

## Replace with the median
df_median = df_flagged.copy()
df_median["Age"] = df_flagged["Age"].mask(age_is_outlier, age_inliers.median())
print(df_median)

## Replace with the mode (when several modes exist, the first one is used)
df_mode = df_flagged.copy()
df_mode["Age"] = df_flagged["Age"].mask(age_is_outlier, age_inliers.mode()[0])
print(df_mode)

# 5. Handling outliers by interpolation

## Linear interpolation
# Interpolation only fills missing values, so the outliers are first turned
# into NaN and then interpolated from their neighbours.
df_interpolated = df_flagged.copy()
df_interpolated["Age"] = df_flagged["Age"].mask(age_is_outlier).interpolate(method="linear")
df_interpolated["Score"] = df_flagged["Score"].mask(score_is_outlier).interpolate(method="linear")
print(df_interpolated)

# Final data (outliers replaced by the inlier mean)
print(df)


#####################################
교재
#####################################

1. 데이터 정제
(1) 결측치 확인과 처리
isnull()의 기본 사용법

import pandas as pd 
# Build the example DataFrame (Bob has no age, Charlie has no score).
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, None, 30],
    'Score': [90, 85, None],
}
df = pd.DataFrame(data)

# Compute the missing-value mask once and reuse it for both reports.
null_mask = df.isnull()
print("결측치 여부 확인:", null_mask, sep="\n")

# Number of missing values in each column
print("\n컬럼별 결측치 개수:", null_mask.sum(), sep="\n")

# Drop every row that contains a missing value
df_cleaned = df.dropna()
print("\n결측치 삭제 후 데이터:", df_cleaned, sep="\n")

# Fill the missing Age with the column mean
mean_age = df['Age'].mean()
df = df.fillna({'Age': mean_age})
print("\nAge 열 평균값으로 결측치 대체:", df, sep="\n")

# Fill the missing Score with 0
df = df.fillna({'Score': 0})
print("\nScore 열 0으로 결측치 대체:", df, sep="\n")

import pandas as pd

# Create the data
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, None, 30],
    Score=[90, 85, None],
)
df = pd.DataFrame(data)

# Report how many values are missing in each column
missing_per_column = df.isnull().sum()
print("결측치 확인:")
print(missing_per_column)

# Handle the missing values in one pass:
#   Age   -> column mean
#   Score -> 0
mean_age = df['Age'].mean()
df = df.fillna({'Age': mean_age, 'Score': 0})

print("\n결측치 처리 후 데이터:")
print(df)

import pandas as pd
# Build the example DataFrame
data = {'Height': [150, 160, 170, 180, 350]}  # 350 is the planted outlier
df = pd.DataFrame(data)

# Print the data
print("데이터:\n", df)

# Check the maximum and minimum
print("\nHeight의 최대값:", df['Height'].max())
print("Height의 최소값:", df['Height'].min())

# Detect outliers with the IQR rule
Q1 = df['Height'].quantile(0.25)  # first quartile
Q3 = df['Height'].quantile(0.75)  # third quartile
IQR = Q3 - Q1  # interquartile range

# Outlier fences
lower_bound = Q1 - 1.5 * IQR  # lower fence
upper_bound = Q3 + 1.5 * IQR  # upper fence

# Rows outside the fences
outliers = df[(df['Height'] < lower_bound) | (df['Height'] > upper_bound)]
print("\nIQR 기준으로 찾은 이상치:\n", outliers)


from scipy.stats import zscore

# Plain Z-score, kept for reference
df['Z_Score'] = zscore(df['Height'])
print("\nZ-스코어:\n", df)

# Detect outliers with a Z-score rule.
# With only 5 samples the plain |z| is bounded by sqrt(n - 1) = 2 (350 scores
# about 1.98), so a "|z| > 3" test can never flag anything. The modified
# Z-score is used instead: it is built from the median and the median absolute
# deviation (MAD), which the outlier itself barely influences. The factor
# 0.6745 rescales MAD to be comparable to a standard deviation for normal
# data, and 3.5 is the commonly recommended cut-off.
# NOTE: MAD is 0 when more than half of the values are identical; this example
# data does not hit that case.
MODIFIED_Z_THRESHOLD = 3.5
height_median = df['Height'].median()
height_mad = (df['Height'] - height_median).abs().median()
modified_z = 0.6745 * (df['Height'] - height_median) / height_mad
outliers = df[modified_z.abs() > MODIFIED_Z_THRESHOLD]
print("\nZ-스코어 기준으로 찾은 이상치:\n", outliers)

# Remove outliers using the IQR fences
df_cleaned = df[(df['Height'] >= lower_bound) & (df['Height'] <= upper_bound)]
print("\n이상치 제거 후 데이터:\n", df_cleaned)

# Replace outliers with the median of the inliers only, so the outlier cannot
# influence its own replacement value.
median_value = df_cleaned['Height'].median()
df['Height'] = df['Height'].apply(lambda x: median_value if (x < lower_bound or x > upper_bound) else x)
print("\n이상치를 중앙값으로 대체한 데이터:\n", df)

import pandas as pd
from scipy.stats import zscore

# Create the data
data = {'Height': [150, 160, 170, 180, 350]}
df = pd.DataFrame(data)

# IQR method: quartiles, range, and the 1.5 * IQR fences
heights = df['Height']
Q1, Q3 = heights.quantile(0.25), heights.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Z-score of every height
df['Z_Score'] = zscore(heights)

# Keep only the rows whose height lies inside the fences (both ends inclusive)
df_cleaned = df[heights.between(lower_bound, upper_bound)]

# Replace each outlier with the median of the remaining heights
median_value = df_cleaned['Height'].median()


def replace_if_outlier(height):
    """Return the inlier median for a height outside the IQR fences, else the height."""
    if height < lower_bound or height > upper_bound:
        return median_value
    return height


df['Height'] = heights.apply(replace_if_outlier)

print("최종 데이터:\n", df)

'파이썬' 카테고리의 다른 글

8. 데이터 병합과 추가 (0)	2025.03.18
7. 데이터전처리 (0)	2025.03.18
5. 데이터선택하기 (0)	2025.03.17
4. 데이터불러오기 및 저장하기 (0)	2025.03.17
3. 파이썬 자료형 튜토리얼 (0)	2025.03.17

WORKaHOLIC

6. 데이터 전처리 (결측치와 이상치)

'파이썬' 카테고리의 다른 글

티스토리툴바

6. 데이터 전처리 (결측치와 이상치)

'파이썬' 카테고리의 다른 글

'파이썬' Related Articles

티스토리툴바