NumPy
Array 생성
import numpy as np
# Arrays built from Python lists; a nested list produces a two-row array.
a = np.array([1, 2, 3])
b = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
    ]
)
# A list that mixes float and int literals.
c = np.array([2.2, 5, 1.1])

# Constant-filled arrays with 2 rows and 3 columns.
d = np.zeros(shape=(2, 3))
e = np.ones(shape=(2, 3))

# Random 2x3 array; the result is only displayed, not stored.
np.random.rand(2, 3)

# Every second integer from 10 up to (not including) 50.
f = np.arange(10, 50, 2)
Array 연산
import numpy as np
# Arithmetic between arrays of the same length is applied element by element.
a = np.array([10, 20, 30, 40])
b = np.array([1, 2, 3, 4])

c = a - b
print(c)

d = a * b
print(d)

# Convert Fahrenheit readings to Celsius: C = (F - 32) * 5 / 9.
# The offset must be 32 (the freezing point of water in Fahrenheit), not 31.
fahrenheit = np.array([0, -10, -5, -15, 0])
celsius = (fahrenheit - 32) * (5 / 9)

# The original (misspelled) names are kept as aliases so that any code that
# still refers to them keeps working.
farenheit = fahrenheit
celcius = celsius
Pandas
Series 자료구조
import pandas as pd
# A Series can be built directly from a list of strings or numbers.
pd.Series(["Alice", "Jack", "Molly"])
pd.Series([1, 2, 3])

# The same lists with a missing entry (None) in the last position.
pd.Series(["Alice", "Jack", None])
pd.Series([1, 2, None])

# From a dict: the keys become the index and the values become the data.
students_scores = {
    "Alice": "Physics",
    "Jack": "Chemistry",
    "Molly": "English",
}
s = pd.Series(students_scores)

# The same Series built from a list of values plus an explicit index.
s = pd.Series(
    ["Physics", "Chemistry", "English"],
    index=["Alice", "Jack", "Molly"],
)

# iloc selects by position in the index. The Series has three entries, so the
# valid positions are 0..2; position 3 would raise IndexError.
# Plain s[<int>] on a string-labelled Series is not used here: it relies on a
# positional fallback of [] and would be out of bounds for 3 as well.
s.iloc[2]

# loc selects by the label assigned in the index.
s.loc["Molly"]
# A Series whose index labels are themselves integers.
class_code = {
    99: "Physics",
    100: "Chemistry",
    101: "English",
    102: "History",
}
s = pd.Series(class_code)

# With integer labels, s[0] is a label lookup and raises KeyError because no
# entry is labelled 0. Positional access must go through iloc explicitly.
s.iloc[0]
# NumPy does the work in optimized, vectorized native code, so whole-array
# operations are much faster than a plain Python iteration over the values.
random_values = np.random.randint(low=0, high=1000, size=1000)
s = pd.Series(random_values)

# Sum of all values, computed in one call (result is only displayed).
np.sum(s)

# Add 2 to every element at once, without a loop.
s += 2
DataFrame 자료구조
import pandas as pd
# Create from a Python dictionary: each key names a column.
pd.DataFrame(
    data={
        "A": [1, 2, 3],
        "B": [4, 5, 6],
    }
)

# Create from a two-dimensional NumPy array.
pd.DataFrame(data=np.array([[1, 2], [3, 4]]))

# Create from a pandas Series used as the column "A".
pd.DataFrame(data={"A": pd.Series([1, 2, 3])})
# Group by
# NOTE(review): `df` must already be a DataFrame with the columns
# "cancellation_policy", "review_scores_value" and "reviews_per_month";
# nothing before this statement creates it -- confirm where it is loaded.
aggregations = {
    # NaN-aware mean and standard deviation of the review score per group.
    "review_scores_value": (np.nanmean, np.nanstd),
    # NaN-aware mean of the monthly review count per group.
    "reviews_per_month": np.nanmean,
}
df.groupby("cancellation_policy").agg(aggregations)
# Scales
# One letter grade per row, indexed by a coarse quality label.
df = pd.DataFrame(
    ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D"],
    index=[
        "excellent", "excellent", "excellent",
        "good", "good", "good",
        "ok", "ok", "ok",
        "poor", "poor",
    ],
    columns=["Grades"],
)

# Ordered categorical type: categories are listed from lowest to highest.
my_categories = pd.CategoricalDtype(
    categories=["D", "D+", "C-", "C", "C+", "B-", "B", "B+", "A-", "A", "A+"],
    ordered=True,
)
grades = df["Grades"].astype(my_categories)

# Because the categories are ordered, comparison follows the grade ranking
# rather than alphabetical order of the strings.
above_c = grades > "C"
grades[above_c]
Matplotlib
import matplotlib as mpl

# Show which rendering backend matplotlib is currently using.
mpl.get_backend()

import matplotlib.pyplot as plt

# `plt.plot ?` is IPython-only help syntax and a SyntaxError in plain Python;
# help() shows the same documentation in any interpreter.
help(plt.plot)

# Plot a single point at (3, 2) using the '.' marker.
plt.plot(3, 2, ".")

# New figure: the same point with the 'o' marker and explicit axis limits.
plt.figure()
plt.plot(3, 2, "o")
ax = plt.gca()
ax.axis([0, 6, 0, 10])  # [xmin, xmax, ymin, ymax]

# New figure: several points, each added by its own plot() call.
plt.figure()
plt.plot(1.5, 1.5, "o")
plt.plot(2, 2, "o")
plt.plot(2.5, 2.5, "o")

# Grab the current axes, add one more point, and list the objects it holds.
ax = plt.gca()
plt.plot(1.5, 1.5, "o")
ax.get_children()
Scatter Plots
import matplotlib.pyplot as plt
import numpy as np
x = np.array([1, 2, 3, 4, 5, 6, 7, 8])
y = x

plt.figure()

# Two point groups drawn with the same marker size but their own colour and
# legend label: the first two points, then all remaining points.
groups = (
    (slice(None, 2), "red", "Tall students"),
    (slice(2, None), "blue", "Short students"),
)
for selection, colour, name in groups:
    plt.scatter(x[selection], y[selection], s=100, c=colour, label=name)

# Axis labels and title.
plt.xlabel("The number of times the child kicked a ball")
plt.ylabel("The grade of the student")
plt.title("Relationship between ball kicking and grades")

# Legend without a frame, placed at location code 4, with its own title.
plt.legend(loc=4, frameon=False, title="Legend")
Line Plots
import matplotlib.pyplot as plt
import numpy as np
linear_data = np.array([1, 2, 3, 4, 5, 6, 7, 8])
# Squares of the linear data (the name says "exponential", the curve is x^2).
exponential_data = linear_data ** 2

plt.figure(figsize=(8, 6))

# Eight consecutive days, one per data point (the end date is exclusive).
observation_dates = np.arange("2017-01-01", "2017-01-09", dtype="datetime64[D]")

# Both series against the dates, as lines with circle markers.
plt.plot(
    observation_dates, linear_data, "-o",
    observation_dates, exponential_data, "-o",
)

ax = plt.gca()

# Shade the area between the two curves. The x values must be the same dates
# the lines were plotted against; using range(len(...)) here would place the
# band at x = 0..7, far away from the date-based curves.
ax.fill_between(
    observation_dates,
    linear_data,
    exponential_data,
    facecolor="blue",
    alpha=0.25,
)

# Rotate the date tick labels so they do not overlap.
x = ax.xaxis
for item in x.get_ticklabels():
    item.set_rotation(45)

ax.set_xlabel("Date")
ax.set_ylabel("Units")
ax.set_title("Exponential ($x^2$) vs. Linear ($x$) performance")
Bar Charts
import matplotlib.pyplot as plt
import numpy as np
linear_data = np.array([1, 2, 3, 4, 5, 6, 7, 8])
exponential_data = linear_data ** 2
xvals = range(len(linear_data))

# Each chart below starts its own figure; without that, every bar() / barh()
# call would be drawn on top of whatever figure happens to be current.

# Chart 1: plain bar chart of the linear data.
plt.figure()
plt.bar(xvals, linear_data, width=0.3)

# Chart 2: linear and squared data side by side. The second series is shifted
# right by one bar width so the bars do not overlap.
new_xvals = [item + 0.3 for item in xvals]
plt.figure()
plt.bar(xvals, linear_data, width=0.3)
plt.bar(new_xvals, exponential_data, width=0.3, color="red")

# Chart 3: linear data with a random error bar (1..4) on every bar.
from random import randint

linear_err = [randint(1, 4) for _ in xvals]
plt.figure()
plt.bar(xvals, linear_data, width=0.3, yerr=linear_err)

# Chart 4: stacked vertical bars -- the squared data sits on top of the
# linear data via `bottom`.
plt.figure()
plt.bar(xvals, linear_data, width=0.3, color="b")
plt.bar(xvals, exponential_data, width=0.3, bottom=linear_data, color="r")

# Chart 5: the same stacking drawn horizontally (`height` / `left` replace
# `width` / `bottom`).
plt.figure()
plt.barh(xvals, linear_data, height=0.3, color="b")
plt.barh(xvals, exponential_data, height=0.3, left=linear_data, color="r")
import matplotlib.pyplot as plt
import numpy as np
plt.figure(figsize=(10, 8))

languages = ["Python", "SQL", "Java", "C++", "JavaScript"]
pos = np.arange(len(languages))
popularity = [56, 39, 34, 34, 29]

# All bars in a muted grey, then highlight the first bar in blue.
bars = plt.bar(pos, popularity, align="center", linewidth=0, color="lightslategrey")
bars[0].set_color("#1F77B4")

# Language names as x tick labels; the y ticks are removed because each bar
# is labelled with its value below.
plt.xticks(pos, languages, alpha=0.8)
plt.yticks([])
plt.title("Top 5 Languages for Math & Data \n by % popularity on Stack Overflow", alpha=0.8)

ax = plt.gca()

# Hide the frame lines around the plot.
for spine in ax.spines.values():
    spine.set_visible(False)

# Write each bar's value in white, centred horizontally and 5 units below the
# top of the bar.
for bar in bars:
    height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        height - 5,
        str(int(height)) + "%",
        ha="center",
        color="w",
        fontsize=11,
    )
References