NumPy


Array 생성

import numpy as np
 
# 1-D array built from a flat list.
a = np.array([1, 2, 3])
# 2-D array (2 rows x 3 columns) built from a nested list.
b = np.array([[1, 2, 3], [4, 5, 6]])
# Mixing ints and floats: NumPy upcasts every element to float.
c = np.array([2.2, 5, 1.1])
# Arrays of a given shape pre-filled with zeros / ones.
d = np.zeros(shape=(2, 3))
e = np.ones(shape=(2, 3))
# 2x3 array of random samples; the result is only displayed, not stored.
np.random.rand(2, 3)
# Every second integer from 10 up to (but not including) 50.
f = np.arange(start=10, stop=50, step=2)

Array 연산

import numpy as np
 
# Arithmetic operators on arrays of equal shape are applied element by element.
a = np.array([10, 20, 30, 40])
b = np.array([1, 2, 3, 4])
c = a - b
print(c)
d = a * b
print(d)

# Fahrenheit -> Celsius: C = (F - 32) * 5/9.
# The offset is 32 (the freezing point of water in Fahrenheit), not 31.
farenheit = np.array([0, -10, -5, -15, 0])
celcius = (farenheit - 32) * (5 / 9)

Pandas


Series 자료구조

import pandas as pd
 
# A Series can be built from a plain list; None marks a missing entry
# (in the numeric case pandas stores it as NaN).
pd.Series(['Alice', 'Jack', 'Molly'])
pd.Series([1, 2, 3])
pd.Series(['Alice', 'Jack', None])
pd.Series([1, 2, None])

# Building a Series from a dict uses the keys as index labels.
students_scores = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English'}
s = pd.Series(students_scores)
# The same Series, spelled out as values plus an explicit index.
s = pd.Series(['Physics', 'Chemistry', 'English'], index=['Alice', 'Jack', 'Molly'])

# iloc selects by position (0-based). The Series has three entries, so the
# last valid position is 2; position 3 would raise IndexError.
s.iloc[2]
# Avoid s[<int>] on a label index: it depends on a positional fallback, so
# use iloc for positions and loc for labels.

# loc selects by index label.
s.loc['Molly']
 
class_code = {99: 'Physics',
              100: 'Chemistry',
              101: 'English',
              102: 'History'}
s = pd.Series(class_code)

# When the index labels are integers, s[0] is a *label* lookup and raises
# KeyError because no label 0 exists. Positional access must go through iloc.
s.iloc[0]
 
# NumPy implements its array operations internally in a vectorized (parallel)
# fashion, so they are much faster than a plain Python iteration.
s = pd.Series(np.random.randint(low=0, high=1000, size=1000))
# Sum of all elements; the result is only displayed, not stored.
np.sum(s)
# Add 2 to every element in one operation (no explicit loop).
s += 2

DataFrame 자료구조

import pandas as pd
 
# Create from a Python dictionary (keys become column names).
pd.DataFrame(data={'A': [1, 2, 3], 'B': [4, 5, 6]})

# Create from a 2-D NumPy array.
pd.DataFrame(data=np.array([[1, 2], [3, 4]]))

# Create from a pandas Series.
pd.DataFrame(data={'A': pd.Series([1, 2, 3])})
 
# Group by
# NOTE(review): `df` is not defined anywhere above this line in these notes;
# presumably it is a DataFrame loaded elsewhere that has the three columns
# referenced below - confirm before running.
# Per cancellation policy: NaN-ignoring mean and std of the review score,
# and NaN-ignoring mean of reviews per month.
df.groupby("cancellation_policy").agg(
    {
        "review_scores_value": (np.nanmean, np.nanstd),
        "reviews_per_month": np.nanmean,
    }
)
 
# Scales
# Letter grades, each labelled (via the index) with a coarse quality rating.
df = pd.DataFrame(
    ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D'],
    index=['excellent', 'excellent', 'excellent',
           'good', 'good', 'good',
           'ok', 'ok', 'ok',
           'poor', 'poor'],
    columns=["Grades"],
)
# Ordered categorical dtype: categories are listed from worst to best, so
# comparison operators follow grade order rather than string order.
my_categories = pd.CategoricalDtype(
    categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'],
    ordered=True,
)
grades = df["Grades"].astype(my_categories)
# Keep only the grades strictly better than "C".
grades.loc[grades > "C"]

Matplotlib


import matplotlib as mpl
# Name of the rendering backend matplotlib is currently using.
mpl.get_backend()

import matplotlib.pyplot as plt
# `plt.plot?` is IPython/Jupyter-only help syntax and a SyntaxError in plain
# Python; help() shows the same signature and docstring everywhere.
help(plt.plot)
 
# Plot a single point at (3, 2) using the '.' marker.
plt.plot(3, 2, '.')

# Start a new figure, plot the same point with the 'o' marker, then set the
# axis limits on the current axes: x from 0 to 6, y from 0 to 10.
plt.figure()
plt.plot(3, 2, 'o')
ax = plt.gca()
ax.axis([0,6,0,10])

# Start another figure; each plt.plot call adds a separate point to it.
plt.figure()
plt.plot(1.5, 1.5, 'o')
plt.plot(2, 2, 'o')
plt.plot(2.5, 2.5, 'o')

# No plt.figure() here, so this point is added to the figure created above.
# get_children() lists the objects contained in the current axes.
ax = plt.gca()
plt.plot(1.5, 1.5, 'o')
ax.get_children()

Scatter Plots

import matplotlib.pyplot as plt
import numpy as np
 
x = np.array([1, 2, 3, 4, 5, 6, 7, 8])
y = x  # same array, so every point lies on the line y = x
plt.figure()
# Two scatter calls give two legend entries: the first two points in red,
# the remaining six in blue.
plt.scatter(
    x[:2], y[:2],
    s=100, c='red', label='Tall students',
)
plt.scatter(
    x[2:], y[2:],
    s=100, c='blue', label='Short students',
)
plt.xlabel('The number of times the child kicked a ball')
plt.ylabel('The grade of the student')
plt.title('Relationship between ball kicking and grades')
# loc=4 is matplotlib's numeric code for the lower-right corner.
plt.legend(
    loc=4,
    frameon=False,
    title='Legend',
)

Line Plots

import matplotlib.pyplot as plt
import numpy as np
 
linear_data = np.array([1, 2, 3, 4, 5, 6, 7, 8])
# NOTE: despite the name, this series is quadratic (x squared).
exponential_data = linear_data**2
plt.figure(figsize=(8, 6))

# One date per observation: 2017-01-01 through 2017-01-08 (end is exclusive).
observation_dates = np.arange('2017-01-01', '2017-01-09', dtype='datetime64[D]')
plt.plot(observation_dates, linear_data, '-o', observation_dates, exponential_data, '-o')
# Shade the area between the two curves. The x values must be the same dates
# the lines were plotted against; using range(len(...)) would place the
# shaded region at integer x positions far away from the 2017 dates.
plt.gca().fill_between(observation_dates,
                       linear_data, exponential_data,
                       facecolor='blue',
                       alpha=0.25)

# Rotate the date tick labels so they do not overlap.
x = plt.gca().xaxis
for item in x.get_ticklabels():
    item.set_rotation(45)

ax = plt.gca()
ax.set_xlabel('Date')
ax.set_ylabel('Units')
ax.set_title("Exponential ($x^2$) vs. Linear ($x$) performance")

Bar Charts

import matplotlib.pyplot as plt
import numpy as np
 
# Each demo below starts its own figure; without plt.figure() every chart
# would be drawn on top of whatever axes happened to be current.
linear_data = np.array([1, 2, 3, 4, 5, 6, 7, 8])
xvals = range(len(linear_data))

# 1) Simple bar chart.
plt.figure()
plt.bar(xvals, linear_data, width=0.3)

# 2) Side-by-side bars: shift the second series right by one bar width.
exponential_data = linear_data**2
new_xvals = [item + 0.3 for item in xvals]
plt.figure()
plt.bar(xvals, linear_data, width=0.3)
plt.bar(new_xvals, exponential_data, width=0.3, color='red')

# 3) Bars with (random) error bars.
from random import randint
linear_err = [randint(1, 4) for _ in range(len(linear_data))]
plt.figure()
plt.bar(xvals, linear_data, width=0.3, yerr=linear_err)

# 4) Stacked vertical bars: the second series starts where the first ends.
xvals = range(len(linear_data))
plt.figure()
plt.bar(xvals, linear_data, width=0.3, color='b')
plt.bar(xvals, exponential_data, width=0.3, bottom=linear_data, color='r')

# 5) Stacked horizontal bars: barh uses height/left instead of width/bottom.
xvals = range(len(linear_data))
plt.figure()
plt.barh(xvals, linear_data, height=0.3, color='b')
plt.barh(xvals, exponential_data, height=0.3, left=linear_data, color='r')
import matplotlib.pyplot as plt
import numpy as np
 
plt.figure(figsize=(10, 8))
languages = ['Python', 'SQL', 'Java', 'C++', 'JavaScript']
pos = np.arange(len(languages))
popularity = [56, 39, 34, 34, 29]

# Grey bars, with the first bar recoloured blue to highlight it.
bars = plt.bar(
    pos, popularity,
    align='center', linewidth=0, color='lightslategrey',
)
bars[0].set_color('#1F77B4')

# Label the x ticks with the language names and remove the y ticks; the
# values are written directly onto the bars below.
plt.xticks(pos, languages, alpha=0.8)
plt.yticks([])

plt.title('Top 5 Languages for Math & Data \nby % popularity on Stack Overflow', alpha=0.8)
# Hide the frame around the plot.
for spine in plt.gca().spines.values():
    spine.set_visible(False)

# Write each bar's value as a white percentage label, horizontally centred
# and 5 units below the top of the bar.
for bar in bars:
    height = bar.get_height()
    plt.gca().text(
        bar.get_x() + bar.get_width() / 2,
        height - 5,
        f"{int(height)}%",
        ha='center', color='w', fontsize=11,
    )

References