Question PySpark Query SQL Query
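
The PySpark snippets below assume an active SparkSession named spark and an existing DataFrame df (or df1, df2, transactions, etc. where referenced). A minimal setup sketch; the table and column names here are illustrative assumptions, not part of the original sheet:

    from pyspark.sql import SparkSession

    # Assumed local session and a small sample table for trying the snippets.
    spark = SparkSession.builder.appName("pyspark_sql_cheatsheet").getOrCreate()
    df = spark.createDataFrame(
        [("Alice", 34, "HR", 50000), ("Bob", 28, "IT", 60000), ("Cara", 41, "IT", 75000)],
        ["name", "age", "department", "salary"],
    )
    df.createOrReplaceTempView("table_name")  # lets spark.sql() queries refer to table_name
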
Question: Select all columns from a table.
PySpark: df.select("*").show()
SQL: SELECT * FROM table_name;

Question: Select specific columns (e.g., name, age) from a table.
PySpark: df.select("name", "age").show()
SQL: SELECT name, age FROM table_name;

Question: Filter rows where age is greater than 30.
PySpark: df.filter(df.age > 30).show()
SQL: SELECT * FROM table_name WHERE age > 30;

Question: Count the number of rows in a table.
PySpark: df.count()
SQL: SELECT COUNT(*) FROM table_name;

Question: Group by a column (e.g., department) and count the number of rows.
PySpark: df.groupBy("department").count().show()
SQL: SELECT department, COUNT(*) FROM table_name GROUP BY department;

Question: Calculate the average of a column (e.g., salary).
PySpark:
    from pyspark.sql.functions import avg
    df.select(avg("salary")).show()
SQL: SELECT AVG(salary) FROM table_name;

Question: Join two tables (df1 and df2) on a common column (e.g., id).
PySpark: df1.join(df2, "id", "inner").show()
SQL: SELECT * FROM table1 INNER JOIN table2 ON table1.id = table2.id;

Question: Perform a left join on two tables.
PySpark: df1.join(df2, "id", "left").show()
SQL: SELECT * FROM table1 LEFT JOIN table2 ON table1.id = table2.id;

Question: Find duplicate rows based on a column (e.g., email).
PySpark: df.groupBy("email").count().filter("count > 1").show()
SQL: SELECT email, COUNT(*) FROM table_name GROUP BY email HAVING COUNT(*) > 1;

Question: Rank rows based on a column (e.g., salary) using window functions.
PySpark:
    from pyspark.sql.window import Window
    from pyspark.sql.functions import rank
    window = Window.orderBy("salary")
    df.withColumn("rank", rank().over(window)).show()
SQL: SELECT *, RANK() OVER (ORDER BY salary) AS rank FROM table_name;

Question: Calculate cumulative sum of a column (e.g., sales).
PySpark:
    from pyspark.sql.window import Window
    from pyspark.sql.functions import sum
    window = Window.orderBy("date").rowsBetween(Window.unboundedPreceding, Window.currentRow)
    df.withColumn("cumulative_sum", sum("sales").over(window)).show()
SQL: SELECT date, sales, SUM(sales) OVER (ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_sum FROM table_name;

Question: Pivot a table to transform rows into columns (e.g., year as columns).
PySpark:
    from pyspark.sql.functions import sum
    df.groupBy("product").pivot("year").agg(sum("sales")).show()
SQL: SELECT product, SUM(CASE WHEN year = 2021 THEN sales END) AS "2021", SUM(CASE WHEN year = 2022 THEN sales END) AS "2022" FROM table_name GROUP BY product;

Question: Find the third highest value in a column (e.g., salary).
PySpark:
    from pyspark.sql.window import Window
    from pyspark.sql.functions import row_number, desc, col
    window = Window.orderBy(desc("salary"))
    df.withColumn("row_num", row_number().over(window)).filter(col("row_num") == 3).show()
SQL:
    WITH RankedSalaries AS (
        SELECT salary,
               DENSE_RANK() OVER (ORDER BY salary DESC) AS dense_rank
        FROM employees
    )
    SELECT salary
    FROM RankedSalaries
    WHERE dense_rank = 3;

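Note that the PySpark snippet uses row_number() while the SQL uses DENSE_RANK(); with tied salaries they can return different rows. A sketch that mirrors the SQL's tie handling, assuming the same df:

    from pyspark.sql.window import Window
    from pyspark.sql.functions import dense_rank, desc, col

    # dense_rank gives tied salaries the same rank, so rank 3 is the third distinct salary.
    window = Window.orderBy(desc("salary"))
    df.withColumn("dr", dense_rank().over(window)).filter(col("dr") == 3).show()
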
Question: Calculate the difference between two columns (e.g., revenue - cost).
PySpark: df.withColumn("profit", df.revenue - df.cost).show()
SQL: SELECT revenue - cost AS profit FROM table_name;

Question: Filter rows where a column value is in a list (e.g., id in [1, 2, 3]).
PySpark: df.filter(df.id.isin([1, 2, 3])).show()
SQL: SELECT * FROM table_name WHERE id IN (1, 2, 3);

Question: Find the top N rows based on a column (e.g., salary).
PySpark:
    from pyspark.sql.functions import desc
    df.orderBy(desc("salary")).limit(5).show()
SQL: SELECT * FROM table_name ORDER BY salary DESC LIMIT 5;

Question: Replace null values in a column with a default value (e.g., 0).
PySpark: df.na.fill(0, subset=["column_name"]).show()
SQL: SELECT COALESCE(column_name, 0) FROM table_name;

Question: Concatenate two columns (e.g., first_name and last_name).
PySpark:
    from pyspark.sql.functions import concat
    df.withColumn("full_name", concat(df.first_name, df.last_name)).show()
SQL: SELECT CONCAT(first_name, last_name) AS full_name FROM table_name;

Question: Extract year from a date column.
PySpark:
    from pyspark.sql.functions import year
    df.withColumn("year", year("date_column")).show()
SQL: SELECT EXTRACT(YEAR FROM date_column) AS year FROM table_name;

Question: Calculate the percentage of total for each row (e.g., sales).
PySpark:
    from pyspark.sql.window import Window
    from pyspark.sql.functions import sum
    window = Window.partitionBy()
    df.withColumn("percentage", (df.sales / sum("sales").over(window)) * 100).show()
SQL: SELECT sales, (sales / SUM(sales) OVER ()) * 100 AS percentage FROM table_name;

Question: Find the nth highest salary department-wise.
PySpark:
    from pyspark.sql.window import Window
    from pyspark.sql.functions import row_number, desc, col
    window = Window.partitionBy("department").orderBy(desc("salary"))
    # n is the desired rank (e.g., n = 3)
    df.withColumn("row_num", row_number().over(window)).filter(col("row_num") == n).show()
SQL:
    WITH RankedSalaries AS (
        SELECT department, salary,
               ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num
        FROM table_name
    )
    SELECT * FROM RankedSalaries WHERE row_num = n;

Question: Use a CTE to find employees with salary greater than the average salary.
PySpark:
    from pyspark.sql.functions import avg
    avg_salary = df.select(avg("salary")).collect()[0][0]
    df.filter(df.salary > avg_salary).show()
SQL:
    WITH AvgSalary AS (
        SELECT AVG(salary) AS avg_salary FROM table_name
    )
    SELECT * FROM table_name WHERE salary > (SELECT avg_salary FROM AvgSalary);

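The PySpark version above brings the average back to the driver with collect(); a sketch that keeps the comparison inside Spark using a global window, assuming the same df:

    from pyspark.sql.window import Window
    from pyspark.sql.functions import avg, col

    # Compute the overall average as a window aggregate, then filter against it.
    w = Window.partitionBy()  # one global partition
    (df.withColumn("avg_salary", avg("salary").over(w))
       .filter(col("salary") > col("avg_salary"))
       .drop("avg_salary")
       .show())
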
Question: Find duplicates using a subquery.
PySpark:
    df.createOrReplaceTempView("temp_table")
    spark.sql("SELECT * FROM temp_table WHERE (name, age) IN (SELECT name, age FROM temp_table GROUP BY name, age HAVING COUNT(*) > 1)").show()
SQL: SELECT * FROM table_name WHERE (name, age) IN (SELECT name, age FROM table_name GROUP BY name, age HAVING COUNT(*) > 1);

Question: Find the 3rd highest salary in each department using a window function.
PySpark:
    from pyspark.sql.window import Window
    from pyspark.sql.functions import dense_rank, desc, col
    window = Window.partitionBy("department").orderBy(desc("salary"))
    df.withColumn("rank", dense_rank().over(window)).filter(col("rank") == 3).show()
SQL:
    WITH RankedSalaries AS (
        SELECT department, salary,
               DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS rank
        FROM table_name
    )
    SELECT * FROM RankedSalaries WHERE rank = 3;

Question: Query to obtain the third transaction of every user.
SQL:
    WITH cte AS (
        SELECT *,
               ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY transaction_date) AS row_num
        FROM transactions
    )
    SELECT user_id, spend, transaction_date
    FROM cte
    WHERE row_num = 3;

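This entry lists only the SQL; a hedged PySpark equivalent, assuming a transactions DataFrame with user_id, spend, and transaction_date columns:

    from pyspark.sql.window import Window
    from pyspark.sql.functions import row_number, col

    # Number each user's transactions by date and keep the third one.
    w = Window.partitionBy("user_id").orderBy("transaction_date")
    (transactions.withColumn("row_num", row_number().over(w))
        .filter(col("row_num") == 3)
        .select("user_id", "spend", "transaction_date")
        .show())
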
Question: Find the second highest salary among all employees.
SQL:
    WITH cte AS (
        SELECT salary,
               ROW_NUMBER() OVER (ORDER BY salary DESC) AS row_num
        FROM employee
    )
    SELECT salary AS second_highest_salary
    FROM cte
    WHERE row_num = 2;

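A PySpark sketch for the same question, assuming an employee DataFrame with a salary column:

    from pyspark.sql.window import Window
    from pyspark.sql.functions import row_number, desc, col

    # Rank salaries in descending order and keep the second row.
    w = Window.orderBy(desc("salary"))
    (employee.withColumn("row_num", row_number().over(w))
        .filter(col("row_num") == 2)
        .select(col("salary").alias("second_highest_salary"))
        .show())
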
Question: Tweets' rolling averages (3-day rolling average of tweet count per user).
SQL:
    SELECT user_id, tweet_date,
           ROUND(AVG(tweet_count) OVER (
               PARTITION BY user_id
               ORDER BY tweet_date
               ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
           ), 2) AS rolling_avg_3d
    FROM tweets;
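
A PySpark sketch of the same rolling average, assuming a tweets DataFrame with user_id, tweet_date, and tweet_count columns:

    from pyspark.sql.window import Window
    from pyspark.sql.functions import avg, round as spark_round

    # Rolling average per user: the current row plus the two preceding rows by date.
    w = (Window.partitionBy("user_id")
         .orderBy("tweet_date")
         .rowsBetween(-2, Window.currentRow))
    (tweets.withColumn("rolling_avg_3d", spark_round(avg("tweet_count").over(w), 2))
        .select("user_id", "tweet_date", "rolling_avg_3d")
        .show())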