1장에서는 Numpy와 Pandas에 대해 배웁니다! 처음이라 정리가 조금 서툴지만, 앞으로 더 열심히 정리해 보겠습니다~~ 🙂‍↕️

1. 넘파이 - 루프를 사용하지 않고도 대량 데이터의 배열 연산을 가능하게 해, 배열 연산 속도가 빠름

- ndarray : 다차원 배열을 쉽게 생성하고 다양한 연산을 수행

import numpy as np

# 1. np.array(): convert a Python list into an ndarray.

array1 = np.array([1, 2, 3])    # one pair of brackets  -> 1-D
array2 = np.array([[1, 2, 3]])  # two pairs of brackets -> 2-D
print('array 1 type:', type(array1))
print('array 2 type:', type(array2))
print('array 1 array 형태:', array1.shape)

# 2. ndim: number of dimensions of an array.

print(f'array 1 :{array1.ndim}차원 , array 2: {array2.ndim}차원')

# 3. astype(): convert the element type.

array_int = np.array([1, 2, 3])
array_float = array_int.astype('float64')  # target dtype given as a string
print(array_float, array_float.dtype)

# 4. ndarray factories - arange, zeros, ones: consecutive values / constant
#    initialisation (handy for test data or bulk-initialising large arrays).

sequence_array = np.arange(10)  # consecutive integers starting at 0
zero_array = np.zeros((3, 2))   # argument is the shape: 3 rows, 2 columns
one_array = np.ones((3, 2))

# 5. reshape(): change the dimensions of an array.
array1 = np.arange(8)                # [0 1 2 3 4 5 6 7]
array2 = array1.reshape((2, 4))
array3 = array1.reshape((4, -1))     # -1 lets NumPy infer the compatible size
array3d = array1.reshape((2, 2, 2))  # 1-D -> 3-D: two 2 x 2 matrices
# array3d[0] = [[0, 1], [2, 3]]
# array3d[1] = [[4, 5], [6, 7]]
array2d = array3d.reshape((-1, 1))   # reshape(-1, 1) turns any shape into a single-column 2-D array

# 6. Indexing and slicing
value = array1[2]
value2 = array2[0, 1]
array4 = array1[:3]
array4 = array1[array1 > 3]  # boolean indexing: condition filter + lookup in one step
# e.g. array1 > 3 -> array([F, F, F, F, T, T, T, T]); that boolean ndarray is
# then passed back inside [] to select the matching elements.

# 7. Sorting arrays: sort() and argsort()
# np.sort(arr) returns a sorted copy and leaves arr unchanged, whereas
# arr.sort() sorts arr in place and returns None.
# Descending order: reverse the ascending result with [::-1]
# (slice syntax is [start:stop:step]).
# For a 2-D array, np.sort(array2d, axis=0) sorts along axis 0 (the row direction).

org_array = np.array([3, 1, 9, 5])

sort_array1 = np.sort(org_array)        # [1 3 5 9]; org_array is still [3 1 9 5]
# org_array is 1-D, so it only has axis 0 (axis=1 would raise AxisError);
# reversing the ascending result gives the descending order.
sort_array1 = np.sort(org_array)[::-1]  # [9 5 3 1]
sort_array2 = org_array.sort()          # in-place sort: returns None, org_array becomes [1 3 5 9]

# argsort() returns the indices that would sort the array. org_array was
# already sorted in place on the previous line, so the result here is [0 1 2 3].
sort_indices = np.argsort(org_array)
# argsort() example
name_array = np.array(['a', 'b', 'c', 'd'])
score_array = np.array([78, 95, 84, 98])
sort_score = np.argsort(score_array)
print(sort_score)  # 0 2 1 3
print(name_array[sort_score])  # a c b d  // index array used to reorder another array

# Linear algebra operations
# dot: matrix product, transpose(): transposed matrix
A = np.array([[1, 2, 3],
              [4, 5, 6]])
B = np.array([[7, 8],
              [9, 10],
              [11, 12]])
dot_product = A.dot(B)  # [[ 58  64] [139 154]]
transpose_mat = A.transpose()

array 1 type: <class 'numpy.ndarray'>
array 2 type: <class 'numpy.ndarray'>
array 1 array 형태: (3,)
array 1 :1차원 , array 2: 2차원
[1. 2. 3.] float64
[0 2 1 3]
['a' 'c' 'b' 'd']

2. 판다스 : 데이터 처리 라이브러리로, 2차원 데이터를 효과적으로 담고 가공

* DataFrame : 2차원 데이터를 담는 데이터 구조체로 판다스의 핵심 객체

* Series , DataFrame 은 index를 key로 가지는데 Series는 칼럼이 하나 , DataFrame은 칼럼이 여러 개인 구조체

import pandas as pd

# Read the CSV file and build a DataFrame from it
titanic_df=pd.read_csv('train.csv')
titanic_df.head(3) # first 3 rows only
print('Dataframe크기',titanic_df.shape) # prints (891, 12) -> 891 rows, 12 columns

# info(): row count, dtype and non-null count per column
titanic_df.info()  

# describe(): mean, min, max, ... (numeric columns only)
titanic_df.describe()

# column.value_counts(): number of rows per distinct value of the column,
# i.e. its distribution. The distinct values become the index of the result.
value_counts =titanic_df['Pclass'].value_counts()
print('value_counts',value_counts)
print(titanic_df['Embarked'].value_counts(dropna=False))   # dropna=False also counts nulls

# df['column'] returns a Series: put the column name inside [] on the DataFrame
titanic_pclass = titanic_df['Pclass'] # index + values
titanic_pclass.head()
value_counts  = titanic_df['Pclass'].value_counts() # rows per value; nulls are ignored by default
value_counts  = titanic_df['Pclass'].value_counts(dropna=False)


# 1. ndarray, list, dictionary -> DataFrame
import numpy as np
col_name1 = ['col1']  # column names are passed as a list so more can be added later
list1 = [1, 2, 3]
array1 = np.array(list1)

df_list1 = pd.DataFrame(list1, columns=col_name1)  # pd.DataFrame(data, columns=names)
df_array1 = pd.DataFrame(array1, columns=col_name1)

print('1차원 리스트로 만든 DF :\n', df_list1)
print('1차원 ndarray로 만든 DF :\n', df_array1)

col_name2 = ['col1', 'col2', 'col3']
list2 = [[1, 2, 3], [4, 5, 6]]
df_list2 = pd.DataFrame(list2, columns=col_name2)
print('2차원리스트로 만든 DF :\n', df_list2)

# Named dict1 (not `dict`) so the built-in dict type is not shadowed.
dict1 = {'col1': [1, 11], 'col2': [2, 22], 'col3': [3, 33]}
df_dict = pd.DataFrame(dict1)
print('딕셔너리로 만든 DF:\n', df_dict)  # key -> column name, value -> column data

# 2. DataFrame -> ndarray, list, dict

# 1) ndarray: use the DataFrame's .values attribute
array3 = df_dict.values

# 2) list: df.values.tolist()
list3 = df_dict.values.tolist()

# 3) dict: to_dict('list') maps each column name to a list of its values
dict3 = df_dict.to_dict('list')

Dataframe크기 (891, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
value_counts Pclass
3    491
1    216
2    184
Name: count, dtype: int64
Embarked
S      644
C      168
Q       77
NaN      2
Name: count, dtype: int64
1차원 리스트로 만든 DF :
    col1
0     1
1     2
2     3
1차원 ndarray로 만든 DF :
    col1
0     1
1     2
2     3
2차원리스트로 만든 DF :
    col1  col2  col3
0     1     2     3
1     4     5     6
딕셔너리로 만든 DF:
    col1  col2  col3
0     1     2     3
1    11    22    33

In [69]:

# 1. Creating and modifying DataFrame columns
titanic_df['Age_0'] = 0  # add a column and assign the same value to every row
titanic_df['Age_by_10'] = titanic_df['Age'] * 10
titanic_df['Family_No'] = titanic_df['SibSp'] + titanic_df['Parch'] + 1
titanic_df['Age_by_10'] = titanic_df['Age_by_10'] + 100  # update an existing column
titanic_df.head(3)

# 2. Deleting DataFrame data: drop()
# axis=1 drops columns; without inplace a new DataFrame is returned and
# titanic_df itself keeps the column.
titanic_drop_df = titanic_df.drop('Age_0', axis=1)
titanic_drop_df.head(3)

# inplace=True: nothing is returned, the columns are removed from titanic_df itself
titanic_df.drop(['Age_0', 'Age_by_10'], axis=1, inplace=True)

Out[69]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked	Age_by_10	Family_No
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S	320.0	2
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C	480.0	2
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S	360.0	1

In [23]:

import pandas as pd

# Index object
titanic_df = pd.read_csv('train.csv')

indexes=titanic_df.index
print(indexes)
print('index 객체 array값 \n' , indexes.values[:50])

# reset_index(): assigns a fresh index; the previous index is kept as a new
# column (named 'index' for this DataFrame)
titanic_reset_df=titanic_df.reset_index(inplace=False)
titanic_reset_df.head(3)

print('### before reset_index ##')
value_counts = titanic_df['Pclass'].value_counts()
print(value_counts)
print('value_counts 객체 변수 타입:',type(value_counts))
# reset_index() on the Series moves its index into a column; the printed type
# shows the result is a DataFrame
new_value_counts=value_counts.reset_index(inplace=False)
print('### After reset index ##')
print(new_value_counts)
print('new_value_counts 객체 변수 타입 :',type(new_value_counts))

# NumPy []  : row/column positions and slice ranges
# pandas [] : a column name, or an index expression (slice / boolean mask)
print(titanic_df['Pclass'].head(3))
titanic_df[0:2]
titanic_df[titanic_df['Pclass']==3].head(3)

RangeIndex(start=0, stop=891, step=1)
index 객체 array값 
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49]
### before reset_index ##
Pclass
3    491
1    216
2    184
Name: count, dtype: int64
value_counts 객체 변수 타입: <class 'pandas.core.series.Series'>
### After reset index ##
   Pclass  count
0       3    491
1       1    216
2       2    184
new_value_counts 객체 변수 타입 : <class 'pandas.core.frame.DataFrame'>
----------------
0    3
1    1
2    3
Name: Pclass, dtype: int64
----------------
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   

   Parch     Ticket     Fare Cabin Embarked  
0      0  A/5 21171   7.2500   NaN        S  
1      0   PC 17599  71.2833   C85        C

Out[23]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.250	NaN	S
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.925	NaN	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.050	NaN	S

In [41]:

# DataFrame iloc[] operator: takes (row, column) as integer positions or
# integer slices. Build a small labelled frame to experiment with.

data = dict(
    Name=['A', 'B', 'C', 'D'],
    Year=[2022, 2023, 2024, 2025],
    Gender=['M', 'M', 'F', 'F'],
)
data_df = pd.DataFrame(data, index=['one', 'two', 'three', 'four'])
data_df

Out[41]:

	Name	Year	Gender
one	A	2022	M
two	B	2023	M
three	C	2024	F
four	D	2025	F

In [42]:

# Single cell: first row, first column
print(data_df.iloc[0,0])
# Row positions 0-1 (end excluded) and column positions 0 and 1
print(data_df.iloc[0:2,[0,1]])
# Row positions 0-2 and column positions 0-2
print(data_df.iloc[0:3,0:3])
# Last column of every row.
# NOTE(review): the printed label says "[:,:-1]" but the code selects [:,-1];
# [:,:-1] would instead mean "every column except the last".
print('\n 맨 마지막 컬럼 데이터 [:,:-1]\n',data_df.iloc[:,-1])

A
    Name  Year
one    A  2022
two    B  2023
      Name  Year Gender
one      A  2022      M
two      B  2023      M
three    C  2024      F

 맨 마지막 컬럼 데이터 [:,:-1]
 one      M
two      M
three    F
four     F
Name: Gender, dtype: object

In [48]:

# DataFrame loc[] operator: label-based selection, loc[index label, column name]
print(data_df.loc['one','Name'])

# Slicing with loc includes the end label, unlike positional iloc slicing
print('위치기반 iloc slicing\n',data_df.iloc[0:1,0],'\n')
print('명칭기반 loc slicing\n',data_df.loc['one':'two','Name'])

A
위치기반 iloc slicing
 one    A
Name: Name, dtype: object 

명칭기반 loc slicing
 one    A
two    B
Name: Name, dtype: object

In [60]:

# Boolean indexing: df[condition]
titanic_df=pd.read_csv('train.csv')
titanic_boolean = titanic_df[titanic_df['Age']>60] 
print(titanic_df['Age']>60) # the condition itself is a boolean Series (True/False per row)

# Filter the rows first, then pick columns from the filtered DataFrame
titanic_df[titanic_df['Age']>60][['Name','Age']].head(3) 

# Same selection with loc: row condition and column list in one step
titanic_df.loc[titanic_df['Age']>60 ,['Name','Age']].head(3)

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Age, Length: 891, dtype: bool

Out[60]:

	Name	Age
33	Wheadon, Mr. Edward H	66.0
54	Ostby, Mr. Engelhart Cornelius	65.0
96	Goldschmidt, Mr. George B	71.0

In [63]:

# Sorting a DataFrame / Series: sort_values()
# by='Name' sorts the rows by the Name column; the result is a new DataFrame
titanic_sorted=titanic_df.sort_values(by='Name')
titanic_sorted.head(5)

Out[63]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
845	846	0	3	Abbing, Mr. Anthony	male	42.0	0	0	C.A. 5547	7.55	NaN	S
746	747	0	3	Abbott, Mr. Rossmore Edward	male	16.0	1	1	C.A. 2673	20.25	NaN	S
279	280	1	3	Abbott, Mrs. Stanton (Rosa Hunt)	female	35.0	1	1	C.A. 2673	20.25	NaN	S
308	309	0	2	Abelson, Mr. Samuel	male	30.0	1	0	P/PP 3381	24.00	NaN	C
874	875	1	2	Abelson, Mrs. Samuel (Hannah Wizosky)	female	28.0	1	0	P/PP 3381	24.00	NaN	C

In [81]:

# Aggregation functions: min, max, sum, count
print(titanic_df.count(),'\n' ) # count() excludes nulls
print(titanic_df['Age'].mean(),'\n')

# groupby returns a differently shaped DataFrame: the aggregation is applied
# to every column except the grouping column
titanic_groupby = titanic_df.groupby(by='Pclass').count()
print(titanic_groupby,'\n')

# Several aggregations can be applied at once via agg([...])
print(titanic_df.groupby('Pclass')['Age'].agg(['max', 'min']))

# Different aggregation per column: pass a dict of {column: function}
agg_format ={'Age':'max','SibSp':'sum','Fare':'mean'}
titanic_df.groupby('Pclass').agg(agg_format)

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64 

29.69911764705882 

        PassengerId  Survived  Name  Sex  Age  SibSp  Parch  Ticket  Fare  \
Pclass                                                                      
1               216       216   216  216  186    216    216     216   216   
2               184       184   184  184  173    184    184     184   184   
3               491       491   491  491  355    491    491     491   491   

        Cabin  Embarked  
Pclass                   
1         176       214  
2          16       184  
3          12       491   

         max   min
Pclass            
1       80.0  0.92
2       70.0  0.67
3       74.0  0.42

Out[81]:

	Age	SibSp	Fare
Pclass
1	80.0	90	84.154687
2	70.0	74	20.662183
3	74.0	302	13.675550

In [94]:

# Handling missing data

# isna(): check for NaN
print(titanic_df[['Name', 'Age']].isna().head(3),'\n') # == titanic_df.iloc[:, [3, 5]].isna().head(3)
print(titanic_df.isna().sum(),'\n') # number of NaN values per column

# fillna(): replace NaN with another value (here the mean of Age)
titanic_df['Age']=titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Age'].isna().sum()

    Name    Age
0  False  False
1  False  False
2  False  False 

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       2
dtype: int64

Out[94]:

np.int64(0)

In [97]:

# apply() with a lambda
# Name_len: length of each passenger's name
titanic_df['Name_len']=titanic_df['Name'].apply(lambda x : len(x))

# Child_Adult: 'Child' when Age <= 15, otherwise 'Adult'
titanic_df['Child_Adult']=titanic_df['Age'].apply(lambda x : 'Child' if x<=15 else 'Adult')
titanic_df[['Age','Child_Adult']].head(8)

Out[97]:

	Age	Child_Adult
0	22.000000	Adult
1	38.000000	Adult
2	26.000000	Adult
3	35.000000	Adult
4	35.000000	Adult
5	29.699118	Adult
6	54.000000	Adult
7	2.000000	Child

[ML_week4_chap3] 평가 (0)	2026.02.10
[ML_week3_chap5] 회귀 (0)	2026.02.03
[ML_week2_chap6] 차원축소 (0)	2026.01.27
[ML_week2_chap2] 사이킷런 (0)	2026.01.24
[ML_week1_chap10] 시각화 (0)	2026.01.19

My blog

My blog

[ML_week1_chap1] Numpy , Pandas 본문

[ML_week1_chap1] Numpy , Pandas

'ML' 카테고리의 다른 글

티스토리툴바

« 2026/03 »
일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30	31