Big Data
Section 7
By : Yosra Attaher
Agenda
●
NumPy
●
Pandas
●
Talk about the project
What is NumPy?
●
NumPy is a Python library used for working with arrays.
●
It also has functions for working in the domains of linear
algebra, Fourier transforms, and matrices.
●
NumPy was created in 2005 by Travis Oliphant. It is an
open source project and you can use it freely.
●
NumPy stands for Numerical Python.
Why Use NumPy?
●
In Python we have lists that serve the purpose of arrays,
but they are slow to process.
●
NumPy aims to provide an array object that is up to 50x
faster than traditional Python lists.
●
The array object in NumPy is called ndarray; it provides
many supporting functions that make working with
ndarray very easy.
●
Arrays are very frequently used in data science, where
speed and resources are very important.
NumPy, Arrays
import numpy as np
# create a NumPy ndarray object by using the array() function.
a = np.array(45)
b = np.array([1, 2, 3])
c = np.array((1, 2, 3)) #Use a tuple to create a NumPy array
d = np.array([[1, 2, 3], [3, 4, 5],[2, 3, 4]])
e = np.array((1, 2, 3), ndmin=5)
print(a.ndim, a)
print(a)
Access Arrays
print(b.ndim, b) # 1 [1 2 3]
print(b[0], ', ', b[2]) #1, 3
print(c.ndim, c) # 1 [1 2 3]
print(c[0], ', ', c[1], ', ', c[-1]) #1, 2, 3
print(d.ndim, d) # 2 [[1 2 3] [3 4 5] [2 3 4]]
print(d[0,0], ', ', d[1,2]) #1, 5
print(e.ndim, e) # 5 [[[[[1 2 3]]]]]
print(e[0,0,0,0,2]) #3
Access Arrays
x = np.array([1,2,3,4,5,6,7,8,9,10])
print(x[1:5]) # [2 3 4 5]
print(x[4:]) # [ 5 6 7 8 9 10]
print(x[:4]) # [1 2 3 4]
print(x[-3:-1]) # [8 9]
print(x[1:5:2]) # [2 4]
print(x[::2]) # [1 3 5 7 9]
Slice
x = np.array([[1, 2, 3, 5], [3, 4, 5, 9],[2, 3, 4, 12],[4, 2, 3, 1]])
RUN:
print(x[1, 1:4])
[4 5 9]
print ("--------------------------") --------------------------
print(x[0:2, 2:4]) [[3 5]
[5 9]]
print ("--------------------------")
--------------------------
print(x[1:3, 1:3]) [[4 5]
[3 4]]
Conversion
x = np.array([1.2, 2.4, 3.5], 'f8') RUN...
print(x.dtype); print(x) float64
y = x.astype('i4') [1.2 2.4 3.5]
print(y.dtype); print(y) int32
x = np.array([1.2, 2.4, 3.5], 'i4') [1 2 3]
print(x.dtype); print(x) int32
x = np.array([1.2, 2.4, 3.5], 'i1') [1 2 3]
print(x.dtype); print(x) int8
[1 2 3]
x = np.array(["1.2", "2.4", "3.5"], 'f8')
float64
print(x.dtype); print(x)
[1.2 2.4 3.5]
Copy and View
x = np.array([1,2,3,4,5,6,7,8,9,10]) RUN….
y = x.copy()
[ 1 2 3 4 5 6 7 8 9 10]
x[1] = 12
[ 1 12 3 4 5 6 7 8 9 10]
print(y)
---------------
y = x.view()
x[1] = 12 [ 1 15 3 4 5 6 7 8 9 10]
print(y) None [ 1 15 3 4 5 6 7 8 9 10]
y[1] = 15
print ("---------------")
print(x)
print(x.base, y.base)
Reshape
x = np.array([[1, 2, 3, 5], [3, 4, 5, 9],[2, 3, 4, 12],
[4, 2, 3, 1]]) RUN…...
--------------------
print("--------------------")
(4, 4)
print(x.shape)
[[ 1 2 3 5 3 4 5 9]
y = x.reshape(2, 8) [ 2 3 4 12 4 2 3 1]]
print(y) --------------------
print("--------------------") [ 1 2 3 5 3 4 5 9 2 3 4 12 4 2 3 1]
--------------------
y = x.reshape(-1)
[[ 1 2 3 5]
print(y) [ 3 4 5 9]
print("--------------------") [ 2 3 4 12]
print(y.base) [ 4 2 3 1]]
Join
x = np.array([[1, 1, 1], [2, 2, 2]]) Run…
(4, 3) [[1 1 1]
y = np.array([[3, 3, 3], [4, 4, 4]])
[2 2 2]
z = np.concatenate((x,y)) [3 3 3]
print(z.shape, z) [4 4 4]]
--------------
print("--------------")
(4, 3) [[1 1 1]
z = np.concatenate((x,y), axis=0) [2 2 2]
print(z.shape, z) [3 3 3]
[4 4 4]]
print("--------------")
--------------
z = np.concatenate((x,y), axis=1)
(2, 6) [[1 1 1 3 3 3]
print(z.shape, z) [2 2 2 4 4 4]]
Search, Sort, Filter
#search
x = np.array([11, 31, 87, 19, 23, 43])
y = np.where(x==19); print(y) RUN…
#sort
(array([3]),)
x = np.array([11, 31, 87, 19, 23, 43])
[11 19 23 31 43 87]
y = np.sort(x); print(y)
#filter [11 23 43]
x = np.array([11, 31, 87, 19, 23, 43])
s = [True, False, False, False, True, True]
y = x[s]; print(y)
NUMPY, RANDOM
import numpy as np RUN…
0.7537900893332695
from numpy import random 86
#basics [30 79 10 14 94]
[[ 7 91 46]
x = random.rand(); print(x) [ 0 65 56]
x = random.randint(100); print(x) [62 64 28]
[91 72 18]
x = random.randint(100, size=5); [16 37 24]]
print(x) [0.89308242 0.11235977 0.57879863 0.63562923 0.68296079]
x = random.randint(100, size=(5, 3)); [[0.28630843 0.87333319 0.07027453]
print(x) [0.82643457 0.81043574 0.47318528]
[0.38990336 0.267552 0.23475348]
x = random.rand(5); print(x) [0.28870442 0.82799002 0.85453119]
x = random.rand(5,3); print(x) [0.55594484 0.29363382 0.97318952]]
Random Choice
x = random.choice([5,3,7,8]); RUN…
print(x)
5
x = random.choice([5,3,7,8],
size=(10)); print(x) [7 3 3 5 5 7 5 5 8 5]
x = random.choice([5,3,7,8], [[8 8 3]
size=(2,3)); print(x) [7 8 3]]
x = random.choice([5,3,7,8], [7 7 7 7 3 3 7 7 7 7]
p=[0.1, 0.3, 0.6, 0.0],
size=(10));
print(x)
Shuffle
x = np.array([1,2,3,4,5,6,7,8]) RUN…
o = x.copy() [1 2 3 4 5 6 7 8]
random.shuffle(x) [7 6 3 5 1 4 8 2]
print('\n', o, '\n', x)
x = np.array([1,2,3,4,5,6,7,8]) [1 2 3 4 5 6 7 8]
y = random.permutation(x) [2 4 3 6 5 1 7 8]
print('\n', x, '\n', y)
Random Distribution
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
# We can plot Normal Distribution, Binomial Distribution,
# Poisson Distribution, Uniform Distribution, Logarithmic
# Distribution, Multinomial Distribution, Exponential
# Distribution, Chi-Square Distribution
What is Pandas?
●
Pandas is a Python library used for working with
data sets.
●
It has functions for analyzing, cleaning, exploring,
and manipulating data.
●
The name "Pandas" refers to both "Panel Data" and
"Python Data Analysis"; the library was created by
Wes McKinney in 2008.
Why Use Pandas?
●
Pandas allows us to analyze big data and draw
conclusions based on statistical theories.
●
Pandas can clean messy data sets, and make
them readable and relevant.
●
Relevant data is very important in data science.
Series, Creation
import pandas as pd
import numpy as np
#creating series
s = pd.Series([22, 32, 31, 42, 51]); print(s)
data = np.array(['a', 'b', 'c', 'd'])
s = pd.Series(data); print(s)
s = pd.Series(data,
index=[100,101,102,103]); print(s)
Series, Creation
data = {'a':100, 'b':120, 'c':99}
s = pd.Series(data); print(s)
data = {'c':99, 'a':100, 'b':120}
s = pd.Series(data, index=['a', 'b', 'c', 'd']);
print(s)
s = pd.Series(5, index=['a', 'b', 'c', 'd']);
print(s)
Series, Accessing
s = pd.Series([1,2,3,4,5],index =
['a','b','c','d','e'])
print(s[0])
print(s[1:3])
print(s[:3])
print(s[1:])
print(s[:])
print(s[-1])
print(s[-3:-1])
print(s['a'])
print(s[['a', 'c', 'e']])
print(s[[2, 4]])
Series, Basic Functions
calories = {'day1': 200, 'day2': 380,
'day3': 480, 'day4': 290}
s = pd.Series(calories); print(s.axes)
print(s.empty)
print(s.ndim)
print(s.size)
print(s.values)
print(s.head(2))
print(s.tail(2))
DataFrame, Creation
data = [12, 12, 13, 14, 15]
df = pd.DataFrame(data); print(df)
df = pd.DataFrame(data, columns =
['Temprature']);
print(df)
df = pd.DataFrame(data, columns =
['Temprature'],
dtype=float); print(df)
data = [['Alex',10],['Bob',12],['Clarke',13]]
df = pd.DataFrame(data,columns=['Name','Age']); print(df)
DataFrame, Creation
data = {
"calories": [200, 380, 480, 290],
"duration": [50, 40, 45, 30]
}
df = pd.DataFrame(data); print(df)
df = pd.DataFrame(data, index=['sat', 'sun',
'mon','tus']); print(df)
df = pd.DataFrame([{'math':88,
'physics':90},{'history':75, 'math':94}]); print(df)
DataFrame, Creation
data = {
'calories': pd.Series([200, 380, 480, 290],
index=['sat', 'sun', 'mon','tus']),
'duration': pd.Series([50, 40, 45, 30], index=['sat',
'sun', 'mon','tus'])
}
df = pd.DataFrame(data); print(df)
df = pd.DataFrame([{'math':88, 'physics':90},{'art':65,
'math':94}], index=['midterm', 'final'],
columns=['physics', 'math', 'art']); print(df)
DataFrame, Basic Functions
data = {
'Name':pd.Series(['Tom','James','Steve','Smith','Jack']),
'Age':pd.Series([25,26,25,23,30,29,23]),
'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,
3.8])};
df = pd.DataFrame(data)
print(df)
print(df.T)
print(df.axes)
print(df.dtypes)
print(df.empty)
print(df.ndim)
print(df.shape)
print(df.size)
print(df.values)
DataFrame, Files
df = pd.read_csv('data/data.csv');
print(df)
df = pd.read_json('data/data.json');
print(df)
print(df.head())
print(df.head(10))
print(df.tail())
print(df.tail(6))
print(df.info())
DataFrame, Cleaning
df = pd.read_csv('data/wdata.csv');
print(df)
print(df.loc[[22, 26,7, 11, 12,
18, 28]])
print(df.info())
DataFrame, Cleaning
dfcopy = df.dropna();
print(dfcopy.info())
df.dropna(inplace = True);
print(df.info())
df = pd.read_csv('data/wdata.csv')
print(df.loc[[22, 26, 7, 11, 12, 18, 28]])
df.fillna(130, inplace = True);
print(df.info())
print(df.loc[[22, 26, 7, 11, 12, 18, 28]])
DataFrame, Cleaning
df = pd.read_csv('data/wdata.csv')
df.dropna(subset=['Date'], inplace = True)
print(df.info())
df = pd.read_csv('data/wdata.csv')
print(df.duplicated())
df.drop_duplicates(inplace=True)
print(df.duplicated())
DataFrame, Files
df = pd.read_csv('data/data.csv');
print(df)
df = pd.read_json('data/data.json');
print(df)
print(df.head())
print(df.head(10))
print(df.tail())
print(df.tail(6))
print(df.info())
Thanks