PySpark Cheat Sheet
PySpark is an interface for Apache Spark in Python. It not only allows you to write Spark applications using Python APIs, but also provides the PySpark shell for interactively analyzing your data in a distributed environment.
Initializing Spark
SparkContext
>>> from pyspark import SparkContext
>>> sc = SparkContext(master='local[2]')

Inspect SparkContext
>>> sc.version                #Retrieve SparkContext version
>>> sc.pythonVer              #Retrieve Python version
>>> sc.master                 #Master URL to connect to
>>> str(sc.sparkHome)         #Path where Spark is installed on worker nodes
>>> str(sc.sparkUser())       #Retrieve name of the Spark user running SparkContext
>>> sc.appName                #Return application name
>>> sc.applicationId          #Retrieve application ID
>>> sc.defaultParallelism     #Return default level of parallelism
>>> sc.defaultMinPartitions   #Default minimum number of partitions for RDDs

Configuration
>>> from pyspark import SparkConf, SparkContext
>>> conf = (SparkConf()
...         .setMaster("local")
...         .setAppName("MyApp")
...         .set("spark.executor.memory", "1g"))
>>> sc = SparkContext(conf=conf)
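Only one SparkContext can be active at a time; a minimal sketch, assuming the conf built above, for reusing an existing context or shutting one down:

>>> sc = SparkContext.getOrCreate(conf)   #Return the active SparkContext, or create one from conf
>>> sc.stop()                             #Stop the SparkContext when finished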

Using The Shell
In the PySpark shell, a special interpreter-aware SparkContext is already created for you, in the variable called sc.

$ ./bin/spark-shell --master local[2]
$ ./bin/pyspark --master local[4] --py-files code.py

Set which master the context connects to with the --master argument, and add Python .zip, .egg or .py files to the runtime path by passing a comma-separated list to --py-files.
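For example, a minimal sketch that ships both a module and a zipped package of dependencies to the executors (deps.zip is a hypothetical archive name):

$ ./bin/pyspark --master local[4] --py-files code.py,deps.zip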
Loading Data
Parallelized Collections
>>> rdd = sc.parallelize([('a',7),('a',2),('b',2)])
>>> rdd2 = sc.parallelize([('a',2),('d',1),('b',1)])
>>> rdd3 = sc.parallelize(range(100))
>>> rdd4 = sc.parallelize([("a",["x","y","z"]), ("b",["p","r"])])

External Data
Read either one text file from HDFS, a local file system or any Hadoop-supported file system URI with textFile(), or read in a directory of text files with wholeTextFiles().
>>> textFile = sc.textFile("/my/directory/*.txt")
>>> textFile2 = sc.wholeTextFiles("/my/directory/")
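The two calls produce differently shaped records: textFile() yields one element per line, while wholeTextFiles() yields (filename, content) pairs; a minimal sketch (no output shown, since it depends on the files read):

>>> textFile.first()    #First line found in the matched files
>>> textFile2.first()   #(path, full file content) pair for the first file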
Retrieving RDD Information
Basic Information
>>> rdd.getNumPartitions()   #List the number of partitions
>>> rdd.count()              #Count RDD instances
3
>>> rdd.countByKey()         #Count RDD instances by key
defaultdict(<type 'int'>, {'a':2,'b':1})
>>> rdd.countByValue()       #Count RDD instances by value
defaultdict(<type 'int'>, {('b',2):1,('a',2):1,('a',7):1})
>>> rdd.collectAsMap()       #Return (key,value) pairs as a dictionary
{'a': 2,'b': 2}
>>> rdd3.sum()               #Sum of RDD elements
4950
>>> sc.parallelize([]).isEmpty()   #Check whether RDD is empty
True

Summary
>>> rdd3.max()          #Maximum value of RDD elements
99
>>> rdd3.min()          #Minimum value of RDD elements
0
>>> rdd3.mean()         #Mean value of RDD elements
49.5
>>> rdd3.stdev()        #Standard deviation of RDD elements
28.866070047722118
>>> rdd3.variance()     #Compute variance of RDD elements
833.25
>>> rdd3.histogram(3)   #Compute histogram by bins
([0,33,66,99],[33,33,34])
>>> rdd3.stats()        #Summary statistics (count, mean, stdev, max & min)
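stats() returns a StatCounter, so the individual figures can be read off without recomputing the RDD; a minimal sketch using rdd3 from above:

>>> st = rdd3.stats()
>>> st.count(), st.mean(), st.stdev()   #Read individual statistics from the StatCounter
(100, 49.5, 28.866070047722118)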

Applying Functions
>>> def g(x): print(x)
>>> rdd.foreach(g)   #Apply a function to all RDD elements
('a', 7)
('b', 2)
('a', 2)
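map() and flatMap() are the usual way to transform every element; a minimal sketch using the rdd defined above, where flatMap() additionally flattens each result:

>>> rdd.map(lambda x: x+(x[1],x[0])).collect()       #Apply a function to each element
[('a',7,7,'a'),('a',2,2,'a'),('b',2,2,'b')]
>>> rdd.flatMap(lambda x: x+(x[1],x[0])).collect()   #Apply the function and flatten the results
['a',7,7,'a','a',2,2,'a','b',2,2,'b']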

Selecting Data
Getting
>>> rdd.collect()   #Return a list with all RDD elements
[('a', 7), ('a', 2), ('b', 2)]
>>> rdd.take(2)     #Take first 2 RDD elements
[('a', 7), ('a', 2)]
>>> rdd.first()     #Take first RDD element
('a', 7)
>>> rdd.top(2)      #Take top 2 RDD elements
[('b', 2), ('a', 7)]

Sampling
>>> rdd3.sample(False, 0.15, 81).collect()   #Return sampled subset of rdd3
[3,4,27,31,40,41,42,43,60,76,79,80,86,...]
Filtering
>>> rdd.filter(lambda x: "a" in
x).collect() #Filter the RDD
[( 'a',7),('a' ,2)]

>>> rdd5.distinct().collect()
#Return distinct RDD values

[ 'a',2, 'b',7]

>>> rdd.keys().collect() #Return
(key,value) RDD's keys

['a' ,'a' , 'b']

Iterating
>>> def g(x): print(x)
>>> rdd.foreach(g)   #Apply a function to all RDD elements
('a', 7)
('b', 2)
('a', 2)

Reshaping Data
Reducing
>>> rdd.reduceByKey(lambda x,y: x+y).collect()   #Merge the rdd values for each key
[('a',9),('b',2)]
>>> rdd.reduce(lambda a,b: a+b)   #Merge the rdd values
('a',7,'a',2,'b',2)
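On a purely numeric RDD, reduce() folds the elements pairwise into a single value; a minimal sketch using rdd3 from above:

>>> rdd3.reduce(lambda a,b: a+b)   #Sum all elements of rdd3
4950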

Grouping by
>>> rdd3.groupBy(lambda x: x % 2).mapValues(list).collect()   #Return RDD of grouped values
>>> rdd.groupByKey().mapValues(list).collect()                #Group rdd by key
[('a',[7,2]),('b',[2])]

Aggregating
>>> seqOp = (lambda x,y: (x[0]+y, x[1]+1))
>>> combOp = (lambda x,y: (x[0]+y[0], x[1]+y[1]))
>>> rdd3.aggregate((0,0), seqOp, combOp)   #Aggregate RDD elements of each partition and then the results
(4950,100)
>>> rdd.aggregateByKey((0,0), seqOp, combOp).collect()   #Aggregate the values of each RDD key
[('a',(9,2)),('b',(2,1))]
>>> from operator import add
>>> rdd3.fold(0, add)   #Aggregate the elements of each partition, and then the results
4950
>>> rdd.foldByKey(0, add).collect()   #Merge the values for each key
[('a',9),('b',2)]
>>> rdd3.keyBy(lambda x: x+x).collect()   #Create tuples of RDD elements by applying a function
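The (sum, count) pair that aggregate() returns is exactly what is needed to compute a mean in a single pass; a minimal sketch, assuming rdd3, seqOp and combOp as defined above:

>>> total, count = rdd3.aggregate((0,0), seqOp, combOp)
>>> float(total) / count   #Mean of rdd3, matching rdd3.mean()
49.5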
Sort
>>> rdd2.sortBy(lambda x: x[1]).collect()   #Sort RDD by given function
[('d',1),('b',1),('a',2)]
>>> rdd2.sortByKey().collect()   #Sort (key,value) RDD by key
[('a',2),('b',1),('d',1)]

Mathematical Operations
>>> rdd.subtract(rdd2).collect()   #Return each rdd value not contained in rdd2
[('b',2),('a',7)]
>>> rdd2.subtractByKey(rdd).collect()   #Return each (key,value) pair of rdd2 with no matching key in rdd
[('b',1)]
>>> rdd.cartesian(rdd2).collect()   #Return the Cartesian product of rdd and rdd2

Repartitioning
>>> rdd.repartition(4)   #New RDD with 4 partitions
>>> rdd.coalesce(1)      #Decrease the number of partitions in the RDD to 1
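repartition() and coalesce() return new RDDs rather than changing the original, so capture the result; a minimal sketch (rdd6 is a hypothetical name):

>>> rdd6 = rdd.repartition(4)   #Keep a handle on the repartitioned RDD
>>> rdd6.getNumPartitions()
4
>>> rdd.getNumPartitions()      #The original rdd keeps its old partition count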
