SQL and PySpark
Select Columns
SQL PySpark
SELECT column1, column2 FROM table; df.select("column1", "column2")
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Filter Rows
SQL PySpark
SELECT * FROM table WHERE condition; df.filter("condition")
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Aggregate Functions
SQL PySpark
SELECT AVG(column) FROM table; df.select(F.avg("column"))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Group By
SQL PySpark
SELECT column, COUNT(*) FROM table df.groupBy("column").count()
GROUP BY column;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Order By
SQL PySpark
SELECT * FROM table ORDER BY column df.orderBy("column", ascending=True)
ASC;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Join
SQL PySpark
SELECT * FROM table1 JOIN table2 ON df1.join(df2, df1.id == df2.id)
table1.id = table2.id;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Union
SQL PySpark
SELECT * FROM table1 UNION SELECT * df1.union(df2)
FROM table2;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Limit
SQL PySpark
SELECT * FROM table LIMIT 100; df.limit(100)
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Distinct Values
SQL PySpark
SELECT DISTINCT column FROM table; df.select("column").distinct()
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Adding a New Column
SQL PySpark
SELECT *, (column1 + column2) AS df.withColumn("new_column", F.col("column1") +
new_column FROM table; F.col("column2"))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Column Alias
SQL PySpark
SELECT column AS alias_name FROM table; df.select(F.col("column").alias("alias_name"))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Filtering on Multiple Conditions
SQL PySpark
SELECT * FROM table WHERE condition1 df.filter((F.col("condition1")) &
AND condition2; (F.col("condition2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Subquery
SQL PySpark
SELECT * FROM (SELECT * FROM table df.filter("condition").alias("subquery")
WHERE condition) AS subquery;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Between
SQL PySpark
SELECT * FROM table WHERE column df.filter(F.col("column").between("val1", "val2"))
BETWEEN val1 AND val2;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Like
SQL PySpark
SELECT * FROM table WHERE column LIKE df.filter(F.col("column").like("pattern"))
pattern;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Case When
SQL PySpark
SELECT CASE WHEN condition THEN result1 df.select(F.when(F.col("condition"),
ELSE result2 END FROM table; "result1").otherwise("result2"))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Cast Data Type
SQL PySpark
SELECT CAST(column AS datatype) FROM df.select(F.col("column").cast("datatype"))
table;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Count Distinct
SQL PySpark
SELECT COUNT(DISTINCT column) FROM df.select(F.countDistinct("column"))
table;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Substring
SQL PySpark
SELECT SUBSTRING(column, start, length) df.select(F.substring("column", start, length))
FROM table;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Concatenate Columns
SQL PySpark
SELECT CONCAT(column1, column2) AS df.withColumn("new_column",
new_column FROM table; F.concat(F.col("column1"), F.col("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Average Over Partition
SQL PySpark
SELECT AVG(column) OVER (PARTITION BY df.withColumn("avg",
column2) FROM table; F.avg("column").over(Window.partitionBy("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Sum Over Partition
SQL PySpark
SELECT SUM(column) OVER (PARTITION BY df.withColumn("sum",
column2) FROM table; F.sum("column").over(Window.partitionBy("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Lead Function
SQL PySpark
SELECT LEAD(column, 1) OVER (ORDER BY df.withColumn("lead", F.lead("column",
column2) FROM table; 1).over(Window.orderBy("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Lag Function
SQL PySpark
SELECT LAG(column, 1) OVER (ORDER BY df.withColumn("lag", F.lag("column",
column2) FROM table; 1).over(Window.orderBy("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Row Count
SQL PySpark
SELECT COUNT(*) FROM table; df.count()
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Drop Column
SQL PySpark
ALTER TABLE table DROP COLUMN column; df.drop("column")
(Not directly in SELECT)
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Rename Column
SQL PySpark
ALTER TABLE table RENAME COLUMN df.withColumnRenamed("column1", "column2")
column1 TO column2; (Not directly in
SELECT)
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Change Column Type
SQL PySpark
ALTER TABLE table ALTER COLUMN column df.withColumn("column",
TYPE new_type; (Not directly in SELECT) df["column"].cast("new_type"))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Creating a Table from Select
SQL PySpark
CREATE TABLE new_table AS SELECT * (df.write.format("parquet").saveAsTable("new_table"))
FROM table;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Inserting Selected Data into Table
SQL PySpark
INSERT INTO table2 SELECT * FROM table1; (df1.write.insertInto("table2"))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Creating a Table with Specific Columns
SQL PySpark
CREATE TABLE new_table AS SELECT (df.select("column1",
column1, column2 FROM table; "column2").write.format("parquet").saveAsTable("new_table"))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Aggregate with Alias
SQL PySpark
SELECT column, COUNT(*) AS count FROM df.groupBy("column").agg(F.count("*").alias("count"))
table GROUP BY column;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Nested Subquery
SQL PySpark
SELECT * FROM (SELECT * FROM table df.filter("condition").alias("sub").filter("sub.condition2")
WHERE condition) sub WHERE
sub.condition2;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Multiple Joins
SQL PySpark
SELECT * FROM table1 JOIN table2 ON df1.join(df2, "id").join(df3, "id")
table1.id = table2.id JOIN table3 ON
table1.id = table3.id;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Cross Join
SQL PySpark
SELECT * FROM table1 CROSS JOIN table2; df1.crossJoin(df2)
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Group By Having Count Greater Than
SQL PySpark
SELECT column, COUNT(*) FROM table df.groupBy("column").count().filter(F.col("count")
GROUP BY column HAVING COUNT(*) > 1; > 1)
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Alias for Table in Join
SQL PySpark
SELECT t1.* FROM table1 t1 JOIN table2 t2 df1.alias("t1").join(df2.alias("t2"), F.col("t1.id") ==
ON t1.id = t2.id; F.col("t2.id"))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Selecting from Multiple Tables
SQL PySpark
SELECT t1.column, t2.column FROM table1 df1.join(df2, df1.id == df2.id).select(df1.column,
t1, table2 t2 WHERE t1.id = t2.id; df2.column)
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Case When with Multiple Conditions
SQL PySpark
SELECT CASE WHEN condition THEN df.select(F.when(F.col("condition"),
'value1' WHEN condition2 THEN 'value2' "value1").when(F.col("condition2"),
ELSE 'value3' END FROM table; "value2").otherwise("value3"))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Extracting Date Parts
SQL PySpark
SELECT EXTRACT(YEAR FROM date_column) df.select(F.year(F.col("date_column")))
FROM table;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Inequality Filtering
SQL PySpark
SELECT * FROM table WHERE column != df.filter(df.column != 'value')
'value';
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
In List
SQL PySpark
SELECT * FROM table WHERE column IN df.filter(df.column.isin('value1', 'value2'))
('value1', 'value2');
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Not In List
SQL PySpark
SELECT * FROM table WHERE column NOT df.filter(~df.column.isin('value1', 'value2'))
IN ('value1', 'value2');
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Null Values
SQL PySpark
SELECT * FROM table WHERE column IS df.filter(df.column.isNull())
NULL;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Not Null Values
SQL PySpark
SELECT * FROM table WHERE column IS df.filter(df.column.isNotNull())
NOT NULL;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
String Upper Case
SQL PySpark
SELECT UPPER(column) FROM table; df.select(F.upper(df.column))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
String Lower Case
SQL PySpark
SELECT LOWER(column) FROM table; df.select(F.lower(df.column))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
String Length
SQL PySpark
SELECT LENGTH(column) FROM table; df.select(F.length(df.column))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Trim String
SQL PySpark
SELECT TRIM(column) FROM table; df.select(F.trim(df.column))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Left Trim String
SQL PySpark
SELECT LTRIM(column) FROM table; df.select(F.ltrim(df.column))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Right Trim String
SQL PySpark
SELECT RTRIM(column) FROM table; df.select(F.rtrim(df.column))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
String Replace
SQL PySpark
SELECT REPLACE(column, 'find', 'replace') df.select(F.regexp_replace(df.column, 'find',
FROM table; 'replace'))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Substring Index
SQL PySpark
SELECT SUBSTRING_INDEX(column, 'delim', df.select(F.expr("split(column, 'delim')[count-1]"))
count) FROM table; (Assuming 1-based index)
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Date Difference
SQL PySpark
SELECT DATEDIFF('date1', 'date2') FROM df.select(F.datediff(F.col('date1'), F.col('date2')))
table;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Add Months to Date
SQL PySpark
SELECT ADD_MONTHS(date_column, df.select(F.add_months(df.date_column,
num_months) FROM table; num_months))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
First Value in Group
SQL PySpark
SELECT FIRST_VALUE(column) OVER df.withColumn("first_val",
(PARTITION BY column2) FROM table; F.first("column").over(Window.partitionBy("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Last Value in Group
SQL PySpark
SELECT LAST_VALUE(column) OVER df.withColumn("last_val",
(PARTITION BY column2) FROM table; F.last("column").over(Window.partitionBy("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Row Number Over Partition
SQL PySpark
SELECT ROW_NUMBER() OVER (PARTITION df.withColumn("row_num",
BY column ORDER BY column) FROM table; F.row_number().over(Window.partitionBy("column").orderBy("column")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Rank Over Partition
SQL PySpark
SELECT RANK() OVER (PARTITION BY df.withColumn("rank",
column ORDER BY column) FROM table; F.rank().over(Window.partitionBy("column").orderBy("column")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Dense Rank Over Partition
SQL PySpark
SELECT DENSE_RANK() OVER (PARTITION df.withColumn("dense_rank",
BY column ORDER BY column) FROM table; F.dense_rank().over(Window.partitionBy("column").orderBy("column")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Count Rows
SQL PySpark
SELECT COUNT(*) FROM table; df.count()
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Mathematical Operations
SQL PySpark
SELECT column1 + column2 FROM table; df.select(F.col("column1") + F.col("column2"))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
String Concatenation
SQL PySpark
SELECT column1 || '|' || column2 AS df.withColumn("new_column", F.concat_ws("|",
new_column FROM table; F.col("column1"), F.col("column2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Find Minimum Value
SQL PySpark
SELECT MIN(column) FROM table; df.select(F.min("column"))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Find Maximum Value
SQL PySpark
SELECT MAX(column) FROM table; df.select(F.max("column"))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Removing Duplicates
SQL PySpark
SELECT DISTINCT * FROM table; df.distinct()
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Left Join
SQL PySpark
SELECT * FROM table1 LEFT JOIN table2 ON df1.join(df2, df1.id == df2.id, "left")
table1.id = table2.id;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Right Join
SQL PySpark
SELECT * FROM table1 RIGHT JOIN table2 df1.join(df2, df1.id == df2.id, "right")
ON table1.id = table2.id;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Full Outer Join
SQL PySpark
SELECT * FROM table1 FULL OUTER JOIN df1.join(df2, df1.id == df2.id, "outer")
table2 ON table1.id = table2.id;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Group By with Having
SQL PySpark
SELECT column, COUNT(*) FROM table df.groupBy("column").count().filter(F.col("count")
GROUP BY column HAVING COUNT(*) > 10; > 10)
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Round Decimal Values
SQL PySpark
SELECT ROUND(column, 2) FROM table; df.select(F.round("column", 2))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Get Current Date
SQL PySpark
SELECT CURRENT_DATE(); df.select(F.current_date())
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Date Addition
SQL PySpark
SELECT DATE_ADD(date_column, 10) FROM df.select(F.date_add(F.col("date_column"), 10))
table;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Date Subtraction
SQL PySpark
SELECT DATE_SUB(date_column, 10) FROM df.select(F.date_sub(F.col("date_column"), 10))
table;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Extract Year from Date
SQL PySpark
SELECT YEAR(date_column) FROM table; df.select(F.year(F.col("date_column")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Extract Month from Date
SQL PySpark
SELECT MONTH(date_column) FROM table; df.select(F.month(F.col("date_column")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Extract Day from Date
SQL PySpark
SELECT DAY(date_column) FROM table; df.select(F.dayofmonth(F.col("date_column")))
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Sorting Descending
SQL PySpark
SELECT * FROM table ORDER BY column df.orderBy(F.col("column").desc())
DESC;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Group By Multiple Columns
SQL PySpark
SELECT col1, col2, COUNT(*) FROM table df.groupBy("col1", "col2").count()
GROUP BY col1, col2;
Shwetank Singh
GritSetGrow - GSGLearn.com
SQL and PySpark
Conditional Column Update
SQL PySpark
UPDATE table SET column1 = CASE WHEN df.withColumn("column1",
condition THEN 'value1' ELSE 'value2' END; F.when(F.col("condition"),
"value1").otherwise("value2"))
Shwetank Singh
GritSetGrow - GSGLearn.com