retail = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/6. 데이터 분석/OnlineRetail.csv')
retail # 데이터가 너무 많아서 중간에 생략됨
pd.options.display.max_rows = 6
retail # 위 아래 3개씩 데이터 보기
retail.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 InvoiceNo 541909 non-null object
1 StockCode 541909 non-null object
2 Description 540455 non-null object
3 Quantity 541909 non-null int64
4 InvoiceDate 541909 non-null object
5 UnitPrice 541909 non-null float64
6 CustomerID 406829 non-null float64
7 Country 541909 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB
# info를 보고 고객아이디, 설명이 null값이 포함 된 것을 확인
pd.options.display.max_rows = 10
# 각 필드당 null이 몇개 있는지 확인
retail.isnull().sum() # 고객아이디가 null인것은 비회원 주문을 뜻ㅅ함
InvoiceNo 0
StockCode 0
Description 1454
Quantity 0
InvoiceDate 0
UnitPrice 0
CustomerID 135080
Country 0
dtype: int64
# 각 필드당 null이 몇% 있는지 확인
retail.isnull().mean()
InvoiceNo 0.000000
StockCode 0.000000
Description 0.002683
Quantity 0.000000
InvoiceDate 0.000000
UnitPrice 0.000000
CustomerID 0.249267
Country 0.000000
dtype: float64
# 고객의 총 지출비용(CheckoutPrice) 파생변수 추가하기
# 수량 * 가격
retail['CheckoutPrice'] = retail['UnitPrice'] * retail['Quantity']
retail
retail['Country'].value_counts()
United Kingdom 271313
Germany 6923
France 6335
EIRE 5654
Spain 1983
...
Lithuania 35
Brazil 31
Bahrain 17
Czech Republic 14
Saudi Arabia 8
Name: Country, Length: 37, dtype: int64
retail['Country'].value_counts()
United Kingdom 271313
Germany 6923
France 6335
EIRE 5654
Spain 1983
Netherlands 1723
Belgium 1536
Switzerland 1403
Portugal 1130
Australia 945
Norway 836
Italy 646
Channel Islands 618
Cyprus 535
Finland 508
Austria 314
Sweden 290
Denmark 289
Poland 275
Japan 220
Unspecified 191
Israel 177
Singapore 167
Iceland 139
USA 131
Greece 122
Malta 102
Canada 101
RSA 54
European Community 52
United Arab Emirates 51
Lebanon 43
Lithuania 35
Brazil 31
Bahrain 17
Czech Republic 14
Saudi Arabia 8
Name: Country, dtype: int64
# 국가별 매출
rev_by_countries = retail.groupby('Country')['CheckoutPrice'].sum().sort_values() # 큰 금액순으로 오름차순정렬
rev_by_countries
Country
Saudi Arabia 140.88
Bahrain 548.40
Czech Republic 586.66
RSA 981.97
Brazil 1123.20
...
France 183549.73
Germany 202050.01
EIRE 236665.02
Netherlands 245279.99
United Kingdom 6459137.29
Name: CheckoutPrice, Length: 37, dtype: float64
plot = rev_by_countries.plot(kind='bar', figsize=(20, 10))
plot.set_xlabel('Country', fontsize=12)
plot.set_ylabel('Revenue', fontsize=12)
plot.set_title('Revenue By Country', fontsize=15)
plot.set_xticklabels(labels=rev_by_countries.index, rotation=45)
↪
[Text(0, 0, 'Saudi Arabia'),
Text(1, 0, 'Bahrain'),
Text(2, 0, 'Czech Republic'),
Text(3, 0, 'RSA'),
Text(4, 0, 'Brazil'),
Text(5, 0, 'European Community'),
Text(6, 0, 'United Arab Emirates'),
Text(7, 0, 'Lithuania'),
Text(8, 0, 'Lebanon'),
Text(9, 0, 'Unspecified'),
Text(10, 0, 'Malta'),
Text(11, 0, 'USA'),
Text(12, 0, 'Canada'),
Text(13, 0, 'Iceland'),
Text(14, 0, 'Greece'),
Text(15, 0, 'Israel'),
Text(16, 0, 'Poland'),
Text(17, 0, 'Austria'),
Text(18, 0, 'Cyprus'),
Text(19, 0, 'Italy'),
Text(20, 0, 'Denmark'),
Text(21, 0, 'Channel Islands'),
Text(22, 0, 'Singapore'),
Text(23, 0, 'Finland'),
Text(24, 0, 'Sweden'),
Text(25, 0, 'Portugal'),
Text(26, 0, 'Japan'),
Text(27, 0, 'Norway'),
Text(28, 0, 'Belgium'),
Text(29, 0, 'Switzerland'),
Text(30, 0, 'Spain'),
Text(31, 0, 'Australia'),
Text(32, 0, 'France'),
Text(33, 0, 'Germany'),
Text(34, 0, 'EIRE'),
Text(35, 0, 'Netherlands'),
Text(36, 0, 'United Kingdom')]
rev_by_countries / total_revenue # 전체 매출에서 각 국가의 매출 비중(퍼센트) 위의 표에서처럼 영국이 압도적으로 가장 큰 비중을 차지
↪
Country
Saudi Arabia 0.000018
Bahrain 0.000070
Czech Republic 0.000075
RSA 0.000125
Brazil 0.000143
...
France 0.023341
Germany 0.025693
EIRE 0.030095
Netherlands 0.031191
United Kingdom 0.821364
Name: CheckoutPrice, Length: 37, dtype: float64
# 월별 매출 구하기
retail['InvoiceDate'].sort_values(ascending=False) # 년도가 다른 것은 정제시키자
↪
541908 2011-12-09 12:50:00
541907 2011-12-09 12:50:00
541906 2011-12-09 12:50:00
541905 2011-12-09 12:50:00
541903 2011-12-09 12:50:00
...
3 2010-12-01 08:26:00
1 2010-12-01 08:26:00
5 2010-12-01 08:26:00
6 2010-12-01 08:26:00
0 2010-12-01 08:26:00
Name: InvoiceDate, Length: 304911, dtype: datetime64[ns]
# 그룹을 맺으면 안됨. 같은 달 같은 시간인 애들끼리 또 세부로 묶여서 월별로 묶는게 안됨.
# 예) 20-11-03-01 입력
def extract_month(date):
month = str(date.month) # 3이 뽑힘
if date.month < 10:
month = '0' + month # 03이 됨
return str(date.year) + month # 201103이 리턴됨
rev_by_month = retail.set_index('InvoiceDate').groupby(extract_month)['CheckoutPrice'].sum()
rev_by_month
# set_index('InvoiceDate') 이 인덱스로 세팅됨
# groupby(extract_month) 201103으로 그룹화시킴
↪
InvoiceDate
201012 509239.10
201101 516132.35
201102 388097.54
201103 521947.66
201104 403484.66
...
201108 564748.05
201109 839985.55
201110 918071.63
201111 1028772.36
201112 474144.60
Name: CheckoutPrice, Length: 13, dtype: float64
def plot_bar(df, xlabel, ylabel, title, rotation=45, titlesize=15, fontsize=12, figsize=(20,10)): # 기본값들 설정
plot = df.plot(kind='bar', figsize=figsize) # figsize를 튜플로 받음
plot.set_xlabel('Country', fontsize=fontsize)
plot.set_ylabel('Revenue', fontsize=fontsize)
plot.set_title(title, fontsize=titlesize)
plot.set_xticklabels(labels=df.index, rotation=rotation)
plot_bar(rev_by_month, 'Month', 'Revenue', 'Revenue By Month') #
# 요일별 매출 구하기
def extract_dow(date):
return date.dayofweek
rev_by_dow = retail.set_index('InvoiceDate')
.groupby(lambda date: date.dayofweek)['CheckoutPrice'].sum() # 잘 이해 안됨
rev_by_dow
# 시각화하기엔 부족한 데이터
# 인덱스를 변환시켜 시각화시켜 보자
↪
InvoiceDate
0 1207916.45
1 1509635.68
2 1386213.36
3 1739873.53
4 1329026.13
6 691251.35
Name: CheckoutPrice, dtype: float64
# 강사님
# 시간대별 매출 구하기
rev_by_hour = retail.set_index('InvoiceDate').groupby(lambda date: date.hour)['CheckoutPrice'].sum()
rev_by_hour
↪
InvoiceDate
6 4.25
7 28326.65
8 250730.92
9 757407.81
10 1110115.45
...
16 418520.35
17 208414.41
18 94131.78
19 42280.07
20 17505.85
Name: CheckoutPrice, Length: 15, dtype: float64
plot_bar(rev_by_hour, 'Hour', 'Revenue', 'Revenue By Hour')