KEMBAR78
SQL Guide for Data Engineers | PDF | Sql | Data Management Software
0% found this document useful (0 votes)
86 views7 pages

SQL Guide for Data Engineers

Uploaded by

Ramesh chaudhary
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
86 views7 pages

SQL Guide for Data Engineers

Uploaded by

Ramesh chaudhary
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 7

SQL Handbook for Data Engineering

Table of Contents
1. Basic SQL Foundations
2. Window Functions
3. Advanced Joins
4. Common Table Expressions (CTEs)
5. Advanced SQL Features
6. Performance Optimization
7. Real-world Examples

Basic SQL Foundations


Data Types
-- Numeric Types
INTEGER, BIGINT, DECIMAL(p,s), FLOAT
-- String Types
VARCHAR(n), CHAR(n), TEXT
-- Date/Time Types
DATE, TIME, TIMESTAMP
-- Others
BOOLEAN, ARRAY, JSON

Basic Query Structure


SELECT column1, column2
FROM table_name
WHERE condition
GROUP BY column1
HAVING group_condition
ORDER BY column1 [ASC|DESC]
LIMIT n;

Window Functions
Syntax
window_function OVER (
[PARTITION BY partition_expression]
[ORDER BY sort_expression [ASC|DESC]]
[ROWS|RANGE frame_extent]
)

Types of Window Functions


Ranking Functions

1
-- ROW_NUMBER(): unique sequential number within each department, highest salary first.
-- The alias is row_num rather than rank: RANK is a reserved word in some engines
-- (e.g. MySQL 8.0), and ROW_NUMBER() is not a rank (tied salaries get different numbers).
SELECT
    employee_name,
    department,
    salary,
    ROW_NUMBER() OVER (
        PARTITION BY department
        -- employee_name breaks salary ties so the numbering is repeatable
        -- (assumes names differ within a department)
        ORDER BY salary DESC, employee_name
    ) AS row_num
FROM employees;

-- RANK(): equal salaries share a rank, and the rank(s) after a tie are skipped.
-- The alias avoids the bare word "rank", which is reserved in some engines (e.g. MySQL 8.0).
SELECT
    employee_name,
    salary,
    RANK() OVER (ORDER BY salary DESC) AS salary_rank
FROM employees;

-- DENSE_RANK(): equal salaries share a rank, and no ranks are skipped after a tie.
-- The alias avoids the bare word "dense_rank", which is reserved in some engines (e.g. MySQL 8.0).
SELECT
    employee_name,
    salary,
    DENSE_RANK() OVER (ORDER BY salary DESC) AS salary_dense_rank
FROM employees;

Aggregate Window Functions


-- Running total of amount in date order.
-- The frame is spelled out as ROWS: with only ORDER BY, the default frame is
-- RANGE UNBOUNDED PRECEDING .. CURRENT ROW, which includes all peer rows, so
-- rows sharing the same date would all show the total through the last of them.
SELECT
    date,
    amount,
    SUM(amount) OVER (
        ORDER BY date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS running_total
FROM sales;

-- Moving average over the current row and the six rows before it, in date order
-- (this is a 7-day window only if sales holds exactly one row per date).
SELECT
    date,
    amount,
    AVG(amount) OVER (ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS weekly_avg
FROM sales;

Value Functions
-- LAG: read the previous row's value (previous in date order).
-- The first row has no predecessor, so previous_day_amount is NULL there.
SELECT
    date,
    amount,
    LAG(amount) OVER (ORDER BY date) AS previous_day_amount
FROM sales;

-- LEAD: read the following row's value (following in date order).
-- The last row has no successor, so next_day_amount is NULL there.
SELECT
    date,
    amount,
    LEAD(amount) OVER (ORDER BY date) AS next_day_amount
FROM sales;

Advanced Joins
Types of Joins with Examples
-- INNER JOIN: returns only orders that have a matching customer row
SELECT
    o.order_id,
    c.customer_name
FROM orders AS o
INNER JOIN customers AS c
    ON c.customer_id = o.customer_id;

-- Multiple LEFT JOINs: every product is kept; category_name / supplier_name
-- are NULL when the product has no matching category / supplier.
SELECT
    p.product_name,
    c.category_name,
    s.supplier_name
FROM products AS p
LEFT JOIN categories AS c
    ON c.category_id = p.category_id
LEFT JOIN suppliers AS s
    ON s.supplier_id = p.supplier_id;

-- FULL OUTER JOIN: keeps unmatched rows from both sides
-- (employees without a department and departments without employees).
SELECT
    e.employee_name,
    d.department_name
FROM employees AS e
FULL OUTER JOIN departments AS d
    ON d.department_id = e.department_id;

-- CROSS JOIN: every product paired with every color (no join condition)
SELECT
    p.product_name,
    c.color
FROM products AS p
CROSS JOIN colors AS c;

Self Joins
-- Employees paired with their managers via a self join on employees.
-- LEFT JOIN keeps employees whose manager_id matches no row; manager is NULL for them.
SELECT
    e1.employee_name AS employee,
    e2.employee_name AS manager
FROM employees AS e1
LEFT JOIN employees AS e2
    ON e2.employee_id = e1.manager_id;

3
Common Table Expressions (CTEs)
Basic CTE
-- monthly_sales: one row per calendar month with that month's summed amount
WITH monthly_sales AS (
    SELECT
        DATE_TRUNC('month', sale_date) AS month,
        SUM(amount) AS total_sales
    FROM sales
    GROUP BY DATE_TRUNC('month', sale_date)
)
-- Put each month next to the previous row's total (NULL for the earliest month)
SELECT
    month,
    total_sales,
    LAG(total_sales) OVER (ORDER BY month) AS prev_month_sales
FROM monthly_sales;

Recursive CTE
-- Employee hierarchy: walks the manager -> report chain from the top down,
-- tagging each employee with its depth (level 1 = no manager).
WITH RECURSIVE emp_hierarchy AS (
    -- Base case: top-level employees (no manager)
    SELECT
        employee_id,
        employee_name,
        manager_id,
        1 AS level
    FROM employees
    WHERE manager_id IS NULL

    UNION ALL

    -- Recursive case: employees whose manager is already in the hierarchy
    SELECT
        e.employee_id,
        e.employee_name,
        e.manager_id,
        h.level + 1
    FROM employees AS e
    INNER JOIN emp_hierarchy AS h
        ON e.manager_id = h.employee_id
)
-- Explicit column list instead of SELECT *, so the output shape is visible here
SELECT
    employee_id,
    employee_name,
    manager_id,
    level
FROM emp_hierarchy;

4
Advanced SQL Features
Subqueries
-- Correlated subquery: the inner query refers to the outer row's department_id.
-- avg_salary is NULL for a department with no employees.
SELECT
    department_name,
    (
        SELECT AVG(salary)
        FROM employees AS e
        WHERE e.department_id = d.department_id
    ) AS avg_salary
FROM departments AS d;

-- Subquery in WHERE clause: products priced above the overall average price
SELECT product_name
FROM products
WHERE price > (SELECT AVG(price) FROM products);

CASE Statements
-- Bucket each order by amount. Branches are tested top to bottom, so
-- 'Medium Order' effectively means 100 <= amount < 1000.
-- Rows with a NULL amount also fall through to ELSE ('Large Order').
SELECT
    order_id,
    amount,
    CASE
        WHEN amount < 100  THEN 'Small Order'
        WHEN amount < 1000 THEN 'Medium Order'
        ELSE 'Large Order'
    END AS order_size
FROM orders;

Performance Optimization
Indexing
-- Single-column index on employees.department_id
CREATE INDEX idx_employee_department ON employees (department_id);

-- Composite index on orders: customer_id first, then order_date
CREATE INDEX idx_order_customer_date ON orders (customer_id, order_date);

Query Optimization Tips


1. Use EXISTS instead of IN for better performance

5
-- Better: EXISTS with a correlated subquery
SELECT *
FROM orders AS o
WHERE EXISTS (
    SELECT 1
    FROM customers AS c
    WHERE c.customer_id = o.customer_id
      AND c.country = 'USA'
);

-- Less efficient: IN with a subquery
SELECT *
FROM orders
WHERE customer_id IN (
    SELECT customer_id
    FROM customers
    WHERE country = 'USA'
);
2. Avoid SELECT *
-- Better: name only the columns you need
SELECT
    customer_id,
    order_date,
    amount
FROM orders;

-- Less efficient: fetches every column
SELECT * FROM orders;

Real-world Examples
Customer Cohort Analysis
-- first_purchase: each customer's cohort = the month of their earliest order
WITH first_purchase AS (
    SELECT
        customer_id,
        DATE_TRUNC('month', MIN(order_date)) AS cohort_month
    FROM orders
    GROUP BY customer_id
),
-- cohort_data: one row per order, tagged with months elapsed since the cohort month
cohort_data AS (
    SELECT
        o.customer_id,
        f.cohort_month,
        -- NOTE(review): DATE_DIFF('month', start, end) is the DuckDB/Trino spelling;
        -- other engines need their own equivalent - confirm against the target engine.
        DATE_DIFF('month', f.cohort_month, o.order_date) AS month_number
    FROM orders AS o
    INNER JOIN first_purchase AS f
        ON o.customer_id = f.customer_id
)
-- Distinct customers from each cohort who ordered in month 0, 1 and 2
SELECT
    cohort_month,
    COUNT(DISTINCT CASE WHEN month_number = 0 THEN customer_id END) AS month_0,
    COUNT(DISTINCT CASE WHEN month_number = 1 THEN customer_id END) AS month_1,
    COUNT(DISTINCT CASE WHEN month_number = 2 THEN customer_id END) AS month_2
FROM cohort_data
GROUP BY cohort_month
ORDER BY cohort_month;

Sales Forecasting
-- sales_stats: monthly sales totals plus the average of the three preceding rows' totals
WITH sales_stats AS (
    SELECT
        DATE_TRUNC('month', sale_date) AS month,
        SUM(amount) AS total_sales,
        -- The window runs over the grouped rows, so it must wrap the aggregate:
        -- a bare AVG(amount) here references an ungrouped column and is rejected.
        -- Months with no sales produce no row, so "3 PRECEDING" means the three
        -- prior rows, which are not necessarily the three prior calendar months.
        AVG(SUM(amount)) OVER (
            ORDER BY DATE_TRUNC('month', sale_date)
            ROWS BETWEEN 3 PRECEDING AND 1 PRECEDING
        ) AS moving_avg
    FROM sales
    GROUP BY DATE_TRUNC('month', sale_date)
)
SELECT
    month,
    total_sales,
    moving_avg,  -- NULL for the earliest month (no preceding rows)
    ROUND(moving_avg * 1.1, 2) AS next_month_forecast  -- moving average scaled up by 10%
FROM sales_stats
ORDER BY month;

Best Practices
1. Always use meaningful aliases
2. Format queries for readability
3. Comment complex logic
4. Use CTEs instead of nested subqueries
5. Consider performance implications for large datasets
6. Use appropriate indexes
7. Regularly run EXPLAIN ANALYZE to check query performance
Remember: The key to mastering SQL is practice and understanding the underlying
concepts rather than memorizing syntax. Start with simple queries and
gradually work your way up to more complex ones.

You might also like