Numpy Basics: Arithmetic Operations
Numpy Basics: Arithmetic Operations
                                                    data	set:	
                                                                                                                                                      other	format	works	as	intuitively	with	pandas.	
http://pandas.pydata.org	                                         Each	variable	is	saved	
                                                                 in	its	own	column	
                                                                                                                   Each	observation	is	
                                                                                                                   saved	in	its	own	row	
                                                                                                                                                                                                                                       M	*	A	
 Syntax	–	Creating	DataFrames	                                                                          Reshaping	Data	–	Change	the	layout	of	a	data	set	
                     a	        b	        c	                                                                                                                                                             df.sort_values('mpg')	
           1	        4	        7	        10	                                                                                                                                                             Order	rows	by	values	of	a	column	(low	to	high).	
           2	        5	        8	        11	                                                                                                                                                            	
           3	        6	        9	        12	
                                                                                                                                                                                                        df.sort_values('mpg',ascending=False)	
                                                                                                                                                                                                         Order	rows	by	values	of	a	column	(high	to	low).	
  df	=	pd.DataFrame(	                                                                                                                                                                                   	
  										{"a"	:	[4	,5,	6],		                           pd.melt(df)	                                                        df.pivot(columns='var',	values='val')	 df.rename(columns	=	{'y':'year'})	
  											"b"	:	[7,	8,	9],		                           		Gather	columns	into	rows.	                                        		Spread	rows	into	columns.	            Rename	the	columns	of	a	DataFrame	
                                                                                                                                                                                                        	
  											"c"	:	[10,	11,	12]},					
  								index	=	[1,	2,	3])	                                                                                                                                                                           df.sort_index()	
                                                                                                                                                                                                         Sort	the	index	of	a	DataFrame	
  		Specify	values	for	each	column.	                                                                                                                                                                    	
  	
                                                                                                                                                                                                        df.reset_index()	
  df	=	pd.DataFrame(	
  					[[4,	7,	10],	                                                                                                                                                                                     Reset	index	of	DataFrame	to	row	numbers,	moving	
  						[5,	8,	11],	                                                                                                                                                                                     index	to	columns.	
                                                                                                                                                                                                            	
  						[6,	9,	12]],		                                     pd.concat([df1,df2])	                                              pd.concat([df1,df2],	axis=1)	                                             df.drop(['Length','Height'],	axis=1)	
  					index=[1,	2,	3],		                                  		Append	rows	of	DataFrames	                                       		Append	columns	of	DataFrames	                                           					Drop	columns	from	DataFrame	
  					columns=['a',	'b',	'c'])	
  		Specify	values	for	each	row.	
           n	   v	
                          a	        b	   c	
                                                            Subset	Observations	(Rows)	                                                                                             Subset	Variables	(Columns)	
                1	        4	        7	   10	
           d	
                2	        5	        8	   11	
           e	   2	        6	        9	   12	
  df	=	pd.DataFrame(	                                                                                                                                                     df[['width','length','species']]	
                                                       df[df.Length	>	7]	                                    df.sample(frac=0.5)	                                         					Select	multiple	columns	with	specific	names.	
  										{"a"	:	[4	,5,	6],		
                                                         Extract	rows	that	meet	logical	                          Randomly	select	fraction	of	rows.		                      df['width']		or		df.width	
  											"b"	:	[7,	8,	9],		
                                                         criteria.	                                          df.sample(n=10)	
  											"c"	:	[10,	11,	12]},				                                                                                                                                     					Select	single	column	with	specific	name.	
                                                       df.drop_duplicates()	                                 					Randomly	select	n	rows.	                                df.filter(regex='regex')	
  index	=	pd.MultiIndex.from_tuples(	
                                                         Remove	duplicate	rows	(only	                        df.iloc[10:20]	
  										[('d',1),('d',2),('e',2)],	                                                                                                                                   					Select	columns	whose	name	matches	regular	expression	regex.	
                                                         considers	columns).	                                					Select	rows	by	position.	
  													names=['n','v']))	
                                                       df.head(n)	                                           df.nlargest(n,	'value')	                                                           regex	(Regular	Expressions)	Examples	
  		Create	DataFrame	with	a	MultiIndex	
                                                         Select	first	n	rows.	                                					Select	and	order	top	n	entries.	                       '\.'	                              Matches	strings	containing	a	period	'.'	
                                                       df.tail(n)	                                           df.nsmallest(n,	'value')	
      Method	Chaining	
                                                                                                                                                                         'Length$'	                         Matches	strings	ending	with	word	'Length'	
                                                         Select	last	n	rows.	                                     Select	and	order	bottom	n	entries.	                     '^Sepal'	                          Matches	strings	beginning	with	the	word	'Sepal'	
 Most	pandas	methods	return	a	DataFrame	so	that	                                                                                                                         '^x[1-5]$'	                        Matches	strings	beginning	with	'x'	and	ending	with	1,2,3,4,5	
 another	pandas	method	can	be	applied	to	the	                                          Logic	in	Python	(and	pandas)	                                                     '^(?!Species$).*'	                 Matches	strings	except	the	string	'Species'	
 result.		This	improves	readability	of	code.	         <	 Less	than	                   !=	                                      Not	equal	to	
 df	=	(pd.melt(df)	                                                                                                                                                       df.loc[:,'x2':'x4']	
 								.rename(columns={	                           >	 Greater	than	                df.column.isin(values)	                  Group	membership	                          					Select	all	columns	between	x2	and	x4	(inclusive).	
 																'variable'	:	'var',			               ==	 Equals	                     pd.isnull(obj)	                          Is	NaN	                                    df.iloc[:,[1,2,5]]	
 																'value'	:	'val'})	                   <=	 Less	than	or	equals	        pd.notnull(obj)	                         Is	not	NaN	
                                                                                                                                                                          					Select	columns	in	positions	1,	2	and	5	(first	column	is	0).	
 								.query('val	>=	200')	                                                                                                                                            df.loc[df['a']	>	10,	['a','c']]	
                                                      >=	 Greater	than	or	equals	 &,|,~,^,df.any(),df.all()	                   Logical	and,	or,	not,	xor,	any,	all	
 					)	                                                                                                                                                                  					Select	rows	meeting	logical	condition,	and	only	the	specific	columns.	
                                                   http://pandas.pydata.org/		This	cheat	sheet	inspired	by	Rstudio	Data	Wrangling	Cheatsheet	(https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf)		Written	by	Irv	Lustig,	Princeton	Consultants	
                      Summarize	Data	                                                                               Handling	Missing	Data	                                                                                           Combine	Data	Sets	
 df['w'].value_counts()	                                                                                  df.dropna()	                                                                                                                adf	                    bdf	
 					Count	number	of	rows	with	each	unique	value	of	variable	                                            					Drop	rows	with	any	column	having	NA/null	data.	                                                                        x1	   x2	              x1	   x3	
 len(df)	                                                                                                 df.fillna(value)	                                                                                                           A	    1	               A	     T	
    #	of	rows	in	DataFrame.	                                                                                 Replace	all	NA/null	data	with	value.	                                                                                    B	    2	               B	     F	
 df['w'].nunique()	                                                                                                                                                                                                                   C	    3	               D	     T	
    #	of	distinct	values	in	a	column.	
 df.describe()	                                                                                                         Make	New	Columns	                                                                                Standard	Joins	
    Basic	descriptive	statistics	for	each	column	(or	GroupBy)	                                                                                                                                                              x1	 x2	 x3	 pd.merge(adf,	bdf,	
                                                                                                                                                                                                                         A	 1	 T	 									how='left',	on='x1')	
                                                                                                                                                                                                                         B	 2	 F	      Join	matching	rows	from	bdf	to	adf.	
                                                                                                                                                                                                                         C	 3	 NaN	 	
                                                                                                          df.assign(Area=lambda	df:	df.Length*df.Height)	                                                                            	
 pandas	provides	a	large	set	of	summary	functions	that	operate	on	                                         					Compute	and	append	one	or	more	new	columns.	                                                              x1	 x2	 x3	 pd.merge(adf,	bdf,	
 different	kinds	of	pandas	objects	(DataFrame	columns,	Series,	                                            df['Volume']	=	df.Length*df.Height*df.Depth	                                                                   A	 1.0	 T	 									how='right',	on='x1')	
 GroupBy,	Expanding	and	Rolling	(see	below))	and	produce	single	                                          					Add	single	column.	                                                                                       B	 2.0	 F	    Join	matching	rows	from	adf	to	bdf.	
 values	for	each	of	the	groups.	When	applied	to	a	DataFrame,	the	                                         pd.qcut(df.col,	n,	labels=False)	                                                                              D	 NaN	 T	 	
 result	is	returned	as	a	pandas	Series	for	each	column.	Examples:	                                           Bin	column	into	n	buckets.	                                                                                             	
                                                                                                                                                                                                                         x1	 x2	 x3	 pd.merge(adf,	bdf,	
 sum()	                                           min()	
                                                                                                                                                                                                                          A	 1	 T	 									how='inner',	on='x1')	
  Sum	values	of	each	object.	                      Minimum	value	in	each	object.	                                              Vector	                                            Vector	                                 B	 2	 F	     Join	data.	Retain	only	rows	in	both	sets.	
 count()	                                         max()	                                                                      function	                                           function	
  Count	non-NA/null	values	of	                     Maximum	value	in	each	object.	                                                                                                                                                    	
  each	object.	                                   mean()	                                                                                                                                                                x1	 x2	 x3	 pd.merge(adf,	bdf,	
 median()	                                         Mean	value	of	each	object.	                            pandas	provides	a	large	set	of	vector	functions	that	operate	on	all	                                             A	 1	 T	 									how='outer',	on='x1')	
  Median	value	of	each	object.	                   var()	                                                  columns	of	a	DataFrame	or	a	single	selected	column	(a	pandas	                                                   B	 2	 F	     Join	data.	Retain	all	values,	all	rows.	
 quantile([0.25,0.75])	                            Variance	of	each	object.	                              Series).	These	functions	produce	vectors	of	values	for	each	of	the	                                              C	 3	 NaN	
  Quantiles	of	each	object.	                       std()	                                                  columns,	or	a	single	Series	for	the	individual	Series.	Examples:	                                               D	 NaN	 T	
 apply(function)	                                  Standard	deviation	of	each	                             max(axis=1)	                       min(axis=1)	                                                                Filtering	Joins	
  Apply	function	to	each	object.	                   object.	                                                 Element-wise	max.	                 Element-wise	min.	                                                        x1	 x2	          adf[adf.x1.isin(bdf.x1)]	
                                                                                                          clip(lower=-10,upper=10)	abs()	                                                                                 A	 1	             All	rows	in	adf	that	have	a	match	in	bdf.	
                              Group	Data	                                                                   Trim	values	at	input	thresholds	 Absolute	value.	                                                             B	 2	           	
                                                                                                                                                                                                                                          	
                                                  df.groupby(by="col")	                                   The	examples	below	can	also	be	applied	to	groups.	In	this	case,	the	                                           x1	 x2	          adf[~adf.x1.isin(bdf.x1)]	
                                                    Return	a	GroupBy	object,	                             function	is	applied	on	a	per-group	basis,	and	the	returned	vectors	                                              C	 3	             All	rows	in	adf	that	do	not	have	a	match	in	bdf.	
                                                    grouped	by	values	in	column	                          are	of	the	length	of	the	original	DataFrame.	
                                                    named	"col".	                                        shift(1)	                                            shift(-1)	                                                              ydf	                   zdf	
                                                  	                                                       Copy	with	values	shifted	by	1.	                       Copy	with	values	lagged	by	1.	                                        x1	   x2	              x1	   x2	
                                                  df.groupby(level="ind")	                               rank(method='dense')	                                cumsum()	                                                              A	    1	               B	    2	
                                                    Return	a	GroupBy	object,	                             Ranks	with	no	gaps.	                                 Cumulative	sum.	                                                       B	    2	               C	    3	
                                                    grouped	by	values	in	index	                          rank(method='min')	                                  cummax()	                                                              C	    3	               D	    4	
                                                    level	named	"ind".	                                   Ranks.	Ties	get	min	rank.	                           Cumulative	max.	
                                                                                                                                                                                                                         Set-like	Operations	
All	of	the	summary	functions	listed	above	can	be	applied	to	a	group.	                                     rank(pct=True)	                                      cummin()	
Additional	GroupBy	functions:	                                                                              Ranks	rescaled	to	interval	[0,	1].	                  Cumulative	min.	                                           x1	 x2	            pd.merge(ydf,	zdf)	
size()	                           agg(function)	                                                         rank(method='first')	                                cumprod()	                                                 B	 2	                Rows	that	appear	in	both	ydf	and	zdf	
  Size	of	each	group.	              Aggregate	group	using	function.	                                       Ranks.	Ties	go	to	first	value.	                       Cumulative	product.	                                       C	 3	                (Intersection).	
                                                                                                                                                                                                                                            	
                                                                                                                                                                                                                         x1	   x2	
                                 Windows	                                                                                                    Plotting	                                                                     A	
                                                                                                                                                                                                                         B	
                                                                                                                                                                                                                               1	
                                                                                                                                                                                                                               2	
                                                                                                                                                                                                                                            pd.merge(ydf,	zdf,	how='outer')	
                                                                                                                                                                                                                                              Rows	that	appear	in	either	or	both	ydf	and	zdf	
                                                                                                                                                                                                                                              (Union).	
 df.expanding()	                                                                                          df.plot.hist()	                              df.plot.scatter(x='w',y='h')	                                     C	    3	             	
  Return	an	Expanding	object	allowing	summary	functions	to	be	                                              Histogram	for	each	column	                   Scatter	chart	using	pairs	of	points	                              D	    4	           pd.merge(ydf,	zdf,	how='outer',		
  applied	cumulatively.	                                                                                                                                                                                                                     									indicator=True)	
 df.rolling(n)	                                                                                                                                                                                                          x1	 x2	
                                                                                                                                                                                                                         A	 1	              .query('_merge	==	"left_only"')	
  Return	a	Rolling	object	allowing	summary	functions	to	be	                                                                                                                                                                                  .drop(['_merge'],axis=1)	
  applied	to	windows	of	length	n.	                                                                                                                                                                                                            Rows	that	appear	in	ydf	but	not	zdf	(Setdiff).	
http://pandas.pydata.org/		This	cheat	sheet	inspired	by	Rstudio	Data	Wrangling	Cheatsheet	(https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf)	Written	by	Irv	Lustig,	Princeton	Consultants	
Python For Data Science Cheat Sheet                                               Asking For Help                                                            Dropping
                                                                                  >>> help(pd.Series.loc)
                                                                                                                                                             >>> s.drop(['a', 'c'])                        Drop values from rows (axis=0)
                           Pandas Basics                                          Selection                                      Also see NumPy Arrays       >>> df.drop('Country', axis=1) Drop values from columns(axis=1)
      Learn Python for Data Science Interactively at www.DataCamp.com
                                                                                   Getting
                                                                                   >>> s['b']                              Get one element                   Sort & Rank
                                                                                     -5
Pandas                                                                                                                                                       >>> df.sort_index()              Sort by labels along an axis
                                                                                                                                                             >>> df.sort_values(by='Country') Sort by the values along an axis
                                                                                   >>> df[1:]                              Get subset of a DataFrame
The Pandas library is built on NumPy and provides easy-to-use                           Country     Capital   Population                                     >>> df.rank()                    Assign ranks to entries
data structures and data analysis tools for the Python                              1     India   New Delhi   1303171035
                                                                                    2    Brazil    Brasília   207847528
programming language.                                                                                                                                        Retrieving Series/DataFrame Information
                                                                                   Selecting, Boolean Indexing & Setting                                        Basic Information
Use the following import convention:                                               By Position                                                                 >>>   df.shape                        (rows,columns)
   >>> import pandas as pd                                                        >>> df.iloc[0, 0]                           Select single value by row &     >>>   df.index	                       Describe index	
                                                                                    'Belgium'                                 column                           >>>   df.columns                      Describe DataFrame columns
Pandas Data Structures                                                            >>> df.iat[0, 0]
                                                                                                                                                               >>>   df.info()                       Info on DataFrame
                                                                                                                                                               >>>   df.count()                      Number of non-NA values
 Series                                                                             'Belgium'
                                                                                                                                                               Summary
   A one-dimensional labeled array                               a   3             By Label
                                                                                  >>> df.loc([0], ['Country'])                Select single value by row &     >>>   df.sum()                              Sum of values
   capable of holding any data type                              b   -5
                                                                                    'Belgium'                                 column labels                    >>>   df.cumsum()                           Cummulative sum of values
                                                                                                                                                               >>>   df.min()/df.max()                     Minimum/maximum values
                                                                 c   7            >>> df.at[0, 'Country']                                                      >>>   df.idxmin()/df.idxmax()
                                                       Index                                                                                                                                               Minimum/Maximum index value
                                                                 d   4              'Belgium'                                                                  >>>   df.describe()                         Summary statistics
                                                                                                                                                               >>>   df.mean()                             Mean of values
>>> s = pd.Series([3, -5, 7, 4], index=['a', 'b', 'c', 'd'])
                                                                                   By Label/Position                                                           >>>   df.median()                           Median of values
                                                                                  >>> df.ix[2]                                Select single row of
 DataFrame                                                                          Country       Brazil
                                                                                    Capital     Brasília
                                                                                                                              subset of rows                 Applying Functions
                                                                                    Population 207847528                                                      >>> f = lambda x: x*2
Columns
              Country    Capital    Population   A two-dimensional labeled        >>> df.ix[:,'Capital']                      Select a single column of       >>> df.apply(f)                       Apply function
                                                                                                                                                              >>> df.applymap(f)                    Apply function element-wise
                                                 data structure with columns        0      Brussels                           subset of columns
          0   Belgium    Brussels    11190846                                       1     New Delhi
                                                 of potentially different types     2      Brasília                                                          Data Alignment
          1    India    New Delhi 1303171035
Index                                                                             >>> df.ix[1,'Capital']                      Select rows and columns
          2    Brazil    Brasília   207847528                                                                                                                 Internal Data Alignment
                                                                                    'New Delhi'
                                                                                                                                                             NA values are introduced in the indices that don’t overlap:
                                                                                   Boolean Indexing
>>> data = {'Country': ['Belgium', 'India', 'Brazil'],                                                                                                        >>> s3 = pd.Series([7, -2, 3], index=['a', 'c', 'd'])
                                                                                  >>> s[~(s > 1)]                     Series s where value is not >1
              'Capital': ['Brussels', 'New Delhi', 'Brasília'],                   >>> s[(s < -1) | (s > 2)]           s where value is <-1 or >2              >>> s + s3
              'Population': [11190846, 1303171035, 207847528]}                    >>> df[df['Population']>1200000000] Use filter to adjust DataFrame            a      10.0
                                                                                                                                                                b      NaN
>>> df = pd.DataFrame(data,                                                        Setting
                                                                                                                                                                c      5.0
                            columns=['Country', 'Capital', 'Population'])         >>> s['a'] = 6                              Set index a of Series s to 6
                                                                                                                                                                d      7.0