Introduction to Data Science

NumPy

Array 생성

import numpy as np
 
# One-dimensional array from a flat list.
a = np.array([1, 2, 3])
# Two-dimensional array: one inner list per row.
b = np.array([[1, 2, 3], [4, 5, 6]])
# Mixing ints and floats yields a floating-point array.
c = np.array([2.2, 5, 1.1])

# Arrays of a fixed shape filled with zeros / ones.
grid_shape = (2, 3)
d = np.zeros(grid_shape)
e = np.ones(grid_shape)
# Random values in the same 2x3 shape (result is not kept).
np.random.rand(*grid_shape)

# Every second integer from 10 up to, but not including, 50.
f = np.arange(10, 50, 2)

Array 연산

import numpy as np
 
# Arithmetic on arrays of equal shape is applied element by element.
a = np.array([10, 20, 30, 40])
b = np.array([1, 2, 3, 4])
c = a - b
print(c)
d = a * b
print(d)

# Fahrenheit -> Celsius: remove the 32-degree offset, then scale by 5/9.
# (Variable names keep their original spelling so existing references work.)
farenheit = np.array([0, -10, -5, -15, 0])
celcius = (farenheit - 32) * (5 / 9)

Pandas

Series 자료구조

import pandas as pd
 
# A Series can be built directly from a list of strings or numbers;
# missing entries are written as None.
pd.Series(['Alice', 'Jack', 'Molly'])
pd.Series([1, 2, 3])
pd.Series(['Alice', 'Jack', None])
pd.Series([1, 2, None])

# The same Series built two ways: from a dict (keys become the index) ...
students_scores = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English'}
s = pd.Series(students_scores)
# ... or from a list of values plus an explicit index.
s = pd.Series(['Physics', 'Chemistry', 'English'], index=['Alice', 'Jack', 'Molly'])

# iloc selects by position (0-based). The Series holds three entries, so the
# last valid position is 2; position 3 would raise IndexError.
s.iloc[2]
# Plain [] on a label-indexed Series is a label lookup, so pass the label
# rather than an integer position.
s['Molly']

# loc selects by index label.
s.loc['Molly']
 
class_code = {99: 'Physics',
              100: 'Chemistry',
              101: 'English',
              102: 'History'}
s = pd.Series(class_code)

# With an integer index, s[0] is a label lookup, not a positional one.
# No entry is labelled 0 here, so it raises KeyError; catch it so the
# demonstration does not abort the rest of the script.
try:
    s[0]
except KeyError:
    print('s[0] raised KeyError: 0 is not an index label')
# iloc is unaffected by the index labels and still selects by position.
s.iloc[0]
 
# NumPy applies an operation to the whole array at once (vectorisation),
# which is much faster than iterating over the elements one by one.
random_values = np.random.randint(0, 1000, 1000)
s = pd.Series(random_values)
np.sum(s)
# Add 2 to every element in place.
s += 2

DataFrame 자료구조

import pandas as pd
 
# Build from a Python dictionary: each key becomes a column.
column_data = {'A': [1, 2, 3], 'B': [4, 5, 6]}
pd.DataFrame(column_data)

# Build from a two-dimensional NumPy array.
matrix = np.array([[1, 2], [3, 4]])
pd.DataFrame(matrix)

# Build from pandas Series: one Series per column.
series_a = pd.Series([1, 2, 3])
pd.DataFrame({'A': series_a})
 
# Group by
# NOTE(review): `df` is not defined before this line in the visible source;
# presumably it is a table with the columns named below — confirm.
# Groups rows by "cancellation_policy" and aggregates "review_scores_value"
# with np.nanmean and np.nanstd, and "reviews_per_month" with np.nanmean.
df.groupby("cancellation_policy").agg({"review_scores_value":(np.nanmean,np.nanstd), "reviews_per_month":np.nanmean})
 
# Scales
# Letter grades, best first, each indexed by a coarse quality label.
letter_grades = ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D']
quality_labels = ['excellent'] * 3 + ['good'] * 3 + ['ok'] * 3 + ['poor'] * 2
df = pd.DataFrame(letter_grades, index=quality_labels, columns=["Grades"])

# Ordered categorical dtype, lowest grade first, so that comparison
# operators follow the grade order instead of string order.
grade_order = ['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+']
my_categories = pd.CategoricalDtype(categories=grade_order, ordered=True)
grades = df["Grades"].astype(my_categories)

# Keep only the grades strictly better than a C.
grades[grades > "C"]

Matplotlib

import matplotlib as mpl
# Name of the rendering backend matplotlib is currently using.
mpl.get_backend()

import matplotlib.pyplot as plt
# `plt.plot?` is IPython-only help syntax and a SyntaxError in plain Python;
# help() shows the same documentation in any interpreter.
help(plt.plot)

# A single data point at (3, 2), drawn with the '.' point marker.
plt.plot(3, 2, '.')

# New figure: the same point with a circle marker, then explicit axis limits.
plt.figure()
plt.plot(3, 2, 'o')
ax = plt.gca()
ax.axis([0, 6, 0, 10])  # [xmin, xmax, ymin, ymax]

# New figure: every plot() call adds another data series to the current axes.
plt.figure()
plt.plot(1.5, 1.5, 'o')
plt.plot(2, 2, 'o')
plt.plot(2.5, 2.5, 'o')

# Add one more point, then list the child artists that make up the axes.
ax = plt.gca()
plt.plot(1.5, 1.5, 'o')
ax.get_children()

Scatter Plots

import matplotlib.pyplot as plt
import numpy as np
 
x = np.array([1, 2, 3, 4, 5, 6, 7, 8])
y = x  # y refers to the same array, so the points lie on the diagonal

plt.figure()
# Draw the first two points and the remaining ones as two separately
# coloured and labelled groups so each gets its own legend entry.
point_groups = (
    (slice(None, 2), 'red', 'Tall students'),
    (slice(2, None), 'blue', 'Short students'),
)
for selection, colour, group_label in point_groups:
    plt.scatter(x[selection], y[selection], s=100, c=colour, label=group_label)

plt.xlabel('The number of times the child kicked a ball')
plt.ylabel('The grade of the student')
plt.title('Relationship between ball kicking and grades')
# loc=4 places the legend in the lower-right corner.
plt.legend(loc=4, frameon=False, title='Legend')

Line Plots

import matplotlib.pyplot as plt
import numpy as np
 
linear_data = np.array([1, 2, 3, 4, 5, 6, 7, 8])
exponential_data = linear_data ** 2  # squares of the linear series
plt.figure(figsize=(8, 6))

# One date per observation: 2017-01-01 through 2017-01-08 (end is exclusive).
observation_dates = np.arange('2017-01-01', '2017-01-09', dtype='datetime64[D]')
plt.plot(observation_dates, linear_data, '-o', observation_dates, exponential_data, '-o')
# Shade the area between the two curves. The x-values must be the same dates
# used for the lines; integer positions would place the band elsewhere on the
# date axis instead of between the curves.
plt.gca().fill_between(observation_dates,
                       linear_data, exponential_data,
                       facecolor='blue',
                       alpha=0.25)

# Rotate the date tick labels so they do not overlap.
x = plt.gca().xaxis
for item in x.get_ticklabels():
    item.set_rotation(45)

ax = plt.gca()
ax.set_xlabel('Date')
ax.set_ylabel('Units')
ax.set_title("Exponential ($x^2$) vs. Linear ($x$) performance")

Bar Charts

import matplotlib.pyplot as plt
import numpy as np
 
linear_data = np.array([1, 2, 3, 4, 5, 6, 7, 8])
xvals = range(len(linear_data))

# Each chart below starts its own figure. Without plt.figure() every bar call
# would draw onto whichever figure is current and the charts would overlap.

# 1) Plain bar chart.
plt.figure()
plt.bar(xvals, linear_data, width=0.3)

# 2) Grouped bars: the second series is shifted right by one bar width.
exponential_data = linear_data ** 2
new_xvals = [item + 0.3 for item in xvals]
plt.figure()
plt.bar(xvals, linear_data, width=0.3)
plt.bar(new_xvals, exponential_data, width=0.3, color='red')

# 3) Bars with error bars of random length 1..4.
from random import randint
linear_err = [randint(1, 4) for _ in range(len(linear_data))]
plt.figure()
plt.bar(xvals, linear_data, width=0.3, yerr=linear_err)

# 4) Stacked bars: the second series starts where the first one ends.
xvals = range(len(linear_data))
plt.figure()
plt.bar(xvals, linear_data, width=0.3, color='b')
plt.bar(xvals, exponential_data, width=0.3, bottom=linear_data, color='r')

# 5) The same stacked chart drawn horizontally.
xvals = range(len(linear_data))
plt.figure()
plt.barh(xvals, linear_data, height=0.3, color='b')
plt.barh(xvals, exponential_data, height=0.3, left=linear_data, color='r')

import matplotlib.pyplot as plt
import numpy as np
 
plt.figure(figsize=(10, 8))
languages = ['Python', 'SQL', 'Java', 'C++', 'JavaScript']
pos = np.arange(len(languages))
popularity = [56, 39, 34, 34, 29]

# Grey bars with the first one recoloured so it stands out.
bars = plt.bar(pos, popularity, align='center', linewidth=0, color='lightslategrey')
bars[0].set_color('#1F77B4')

# Language names as slightly faded x tick labels; no y ticks at all,
# since each bar carries its own value label.
plt.xticks(pos, languages, alpha=0.8)
plt.yticks([])

plt.title('Top 5 Languages for Math & Data \nby % popularity on Stack Overflow', alpha=0.8)

# Hide the frame around the plot.
axes = plt.gca()
for spine in axes.spines.values():
    spine.set_visible(False)

# Write each percentage in white just below the top of its bar.
for bar in bars:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    axes.text(label_x, height - 5, f"{int(height)}%", ha='center', color='w', fontsize=11)

References

Coursera - Applied Data Science with Python Specialization

meatsby.github.io

Explorer

Introduction to Data Science

NumPy

Array 생성

Array 연산

Pandas

Series 자료구조

DataFrame 자료구조

Matplotlib

Scatter Plots

Line Plots

Bar Charts

References

Graph View

Table of Contents