-----------------------------------------------------------
-- Total Training for SQL Server 2005
--
-- Part 2 - Selecting Data
-- Lesson 4 - Aggregating Data
--
-- Paul Nielsen
-----------------------------------------------------------
-----------------------------------------------------------
-- Using the new Windowing Functions
Use AdventureWorks;
SELECT ProductID, TransactionDate
FROM Production.TransactionHistory
ORDER BY TransactionDate;
SELECT ProductID, TransactionDate,
Row_Number() OVER(ORDER BY TransactionDate) as RowNumDate,
Row_Number() OVER(ORDER BY ProductID) + 5000 as RowNumiD
FROM Production.TransactionHistory
-- not permitted
Where Row_Number() OVER(ORDER BY TransactionDate) Between 10001 AND 20000
ORDER BY TransactionDate;
-- Select mid-range of rownumbers & every 5th row
SELECT RowNum, ProductID, TransactionDate
FROM (SELECT ProductID, TransactionDate,
Row_Number() OVER(ORDER BY TransactionDate) as RowNum
FROM Production.TransactionHistory) sq
WHERE RowNum Between 10001 AND 20000
AND RowNum%5 = 0
ORDER BY RowNum ;
-- Ranking and Windowing
SELECT SalesOrderId, CustomerId, TerritoryID,
ROW_NUMBER() OVER(ORDER BY CustomerId) AS RowNum,
RANK() OVER(ORDER BY CustomerId) AS [Rank],
DENSE_RANK() OVER(ORDER BY CustomerId) AS [DenseRank],
NTILE(5) OVER(ORDER BY CustomerId) AS NTile5,
RANK() OVER(PARTITION BY TerritoryID ORDER BY CustomerId) AS [Partition]
FROM Sales.SalesOrderHeader;
-- Percentile Ranking
SELECT SalesOrderId, CustomerId,
NTILE(100) OVER(ORDER BY CustomerId) AS Percentile
FROM Sales.SalesOrderHeader
ORDER BY CustomerId
-- Sample Data
SELECT
SalesOrderId, CustomerId
FROM Sales.SalesOrderHeader TABLESAMPLE (10000 rows)
SELECT
SalesOrderId, CustomerId
FROM Sales.SalesOrderHeader TABLESAMPLE (10 percent)
ORDER BY CustomerId
--------------------------------------------------------------------
-- Using Aggregate Functions
go
-- Build the sample data
USE tempdb
IF EXISTS(SELECT * FROM sysobjects WHERE Name = 'RawData')
DROP TABLE RawData
go
CREATE TABLE RawData (
Region VARCHAR(10),
Category CHAR(1),
Amount INT,
SalesDate DateTime
)
go
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'South', 'Y', 12, '11/1/2005')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'South', 'Y', 24, '11/1/2005')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'South', 'Y', 15, '12/1/2005')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'NorthEast', 'Y', 28, '12/1/2005')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'South', 'X', 11, '1/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'MidWest', 'X', 24, '1/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'West', 'X', 36, '2/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'South', 'Y', 47, '2/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'MidWest', 'Y', 38, '3/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'NorthEast', 'Y', 62, '3/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'South', 'Z', 33, '4/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'MidWest', 'Z', 83, '4/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'West', 'Z', 44, '5/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'NorthEast', 'Z', 55, '5/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'South', 'X', 68, '6/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'South', 'X', 86, '6/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'South', 'Y', 54, '7/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'South', 'Y', 63, '7/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'South', 'Y', 72, '8/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'NorthEast', 'Y', 91, '8/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'NorthEast', 'Y', null, '8/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'NorthEast', 'Y', null, '8/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'NorthEast', 'Y', null, '8/1/2006')
INSERT RawData (Region, Category, Amount, SalesDate)
VALUES( 'NorthEast', 'Y', null, '8/1/2006')
-- check the Amount
SELECT * FROM RawData
-- Simple Aggregations
SELECT
Count(*) as Count,
Sum(Amount) as [Sum],
Avg(Amount) as [Avg],
Min(Amount) as [Min],
Max(Amount) as [Max]
FROM RawData
-- How Nulls affect Averages
SELECT Avg(Cast((Amount)as Numeric(9,5))) as [Numeric Avg],
Avg(Amount) as [Int Avg],
Sum(Amount) / Count(*) as [Manual Avg]
FROM RawData
-- Statistical Functions
SELECT
Count(*) as Count,
StDevP(Amount) as [StDevP],
VarP(Amount) as [VarP]
FROM RawData
SELECT
Count(*) as Count,
StDev(Amount) as [StDev],
Var(Amount) as [Var]
FROM RawData
WHERE Year(SalesDate) = 2006
---------------------------------------------
-- Grouping and Aggregating
SELECT Category,
Count(*) as Count,
Sum(Amount) as [Sum],
Avg(Amount) as [Avg],
Min(Amount) as [Min],
Max(Amount) as [Max]
FROM RawData
GROUP BY Category
-- Group by occurs after the where clause
SELECT Category,
Count(*) as Count,
Sum(Amount) as [Sum],
Avg(Amount) as [Avg],
Min(Amount) as [Min],
Max(Amount) as [Max]
FROM RawData
WHERE Year(SalesDate) = 2006
GROUP BY Category
-- Grouping By multiple columns
SELECT Year(SalesDate) as [Year], DatePart(q,SalesDate) as [Quarter],
Count(*) as Count,
Sum(Amount) as [Sum],
Avg(Amount) as [Avg],
Min(Amount) as [Min],
Max(Amount) as [Max]
FROM RawData
GROUP BY Year(SalesDate), DatePart(q,SalesDate)
---------------------------------------------
-- Including Descriptions
/*
-- This example SHOULD generate an error...
SELECT Region, Count(*)
FROM RawData
*/
IF EXISTS(SELECT * FROM sysobjects WHERE Name = 'RawCategory')
DROP TABLE RawCategory
CREATE TABLE RawCategory (
RawCategoryID CHAR(1),
CategoryName VARCHAR(25)
)
INSERT RawCategory (RawCategoryID, CategoryName)
VALUES ('X', 'Sci-Fi')
INSERT RawCategory (RawCategoryID, CategoryName)
VALUES ('Y', 'Philosophy')
INSERT RawCategory (RawCategoryID, CategoryName)
VALUES ('Z', 'Zoology')
-- including Amount outside the aggregate function or group by will cause an error
/*
SELECT Category, CategoryName,
Sum(Amount) as [Sum],
Avg(Amount) as [Avg],
Min(Amount) as [Min],
Max(Amount) as [Max]
FROM RawData R
JOIN RawCategory C
ON R.Category = C.RawCategoryID
GROUP BY Category
*/
-- Solution 1: include all Amount in the Group By
SELECT Category, CategoryName,
Sum(Amount) as [Sum],
Avg(Amount) as [Avg],
Min(Amount) as [Min],
Max(Amount) as [Max]
FROM RawData R
JOIN RawCategory C
ON R.Category = C.RawCategoryID
GROUP BY Category, CategoryName
ORDER BY Category, CategoryName
-- Solution 2: Aggregate in Subquery, description in outer query
SELECT sq.Category, CategoryName, sq.[Sum], sq.[Avg], sq.[Min], sq.[Max]
FROM (SELECT Category,
Sum(Amount) as [Sum],
Avg(Amount) as [Avg],
Min(Amount) as [Min],
Max(Amount) as [Max]
FROM RawData
GROUP BY Category ) sq
JOIN RawCategory C
ON sq.Category = C.RawCategoryID
ORDER BY Category, CategoryName
--------------------------------------------------------------
-- Including all Group By Values
-- Left Outer Join Group Bys
SELECT Year(SalesDate) AS [Year],
Count(*) as Count,
Sum(Amount) as [Sum],
Avg(Amount) as [Avg],
Min(Amount) as [Min],
Max(Amount) as [Max]
FROM RawData
WHERE Year(SalesDate) = 2006 -- (There are a few 2005 rows in the RawData table)
GROUP BY ALL Year(SalesDate)
------------------------------------------------------------------
-- Nesting Aggregations
-- Which Category sold the most in each quarter?
-- Can't nest aggregate function - error:
/*
Select Y,Q, Max(Sum) as MaxSum
FROM ( -- Calculate Sums
SELECT Category, Year(SalesDate) as Y, DatePart(q,SalesDate) as Q, max(Sum(Amount)) as Sum
FROM RawData
GROUP BY Category, Year(SalesDate), DatePart(q,SalesDate)
) sq
GROUP BY Y,Q
ORDER BY Y,Q
*/
-- Solution: Including Detail description
SELECT MaxQuery.Y, MaxQuery.Q, AllQuery.Category, MaxQuery.MaxSum as sales
FROM (-- Find Max Sum Per Year/Quarter
Select Y,Q, Max(Sum) as MaxSum
FROM ( -- Calculate Sums
SELECT Category, Year(SalesDate) as Y, DatePart(q,SalesDate) as Q, Sum(Amount) as Sum
FROM RawData
GROUP BY Category, Year(SalesDate), DatePart(q,SalesDate)
) sq
GROUP BY Y,Q
) MaxQuery
JOIN (-- All Amount Query
SELECT Category, Year(SalesDate) as Y, DatePart(q,SalesDate) as Q, Sum(Amount) as Sum
FROM RawData
GROUP BY Category, Year(SalesDate), DatePart(q,SalesDate)
)AllQuery
ON MaxQuery.Y = AllQuery.Y
AND MaxQuery.Q = AllQuery.Q
AND MaxQuery.MaxSum = AllQuery.Sum
ORDER BY MaxQuery.Y, MaxQuery.Q
---------------------------------------------
-- Filtering Group By Results
SELECT Year(SalesDate) as [Year],
DatePart(q,SalesDate) as [Quarter],
Count(*) as Count,
Sum(Amount) as [Sum],
Avg(Amount) as [Avg]
FROM RawData
GROUP BY Year(SalesDate), DatePart(q,SalesDate)
HAVING Avg(Amount) > 25
ORDER BY [Year], [Quarter]
--------------------------------------------------------
-- Adding Grand Totals
-- Sum of single column Group By (Roll-Up)
SELECT Grouping(Category), Category,
CASE Grouping(Category)
WHEN 0 THEN Category
WHEN 1 THEN 'All Categories'
END AS Category,
Count(*) as Count
FROM RawData
GROUP BY Category
WITH ROLLUP
-- Sum of Mulitple column Group By (Roll-Up)
-- Second Group by Column w/ subtotals
SELECT
CASE Grouping(Category)
WHEN 0 THEN Category
WHEN 1 THEN 'All Categories'
END AS Category,
CASE Grouping(Year(SalesDate))
WHEN 0 THEN Cast(Year(SalesDate) as CHAR(8))
WHEN 1 THEN 'All Years'
END AS Year,
Count(*) as Count
FROM RawData
GROUP BY Category, Year(SalesDate)
WITH ROLLUP
-- Multidimensional column Group By (Cube)
SELECT
CASE Grouping(Category)
WHEN 0 THEN Category
WHEN 1 THEN 'All Categories'
END AS Category,
CASE Grouping(Year(SalesDate))
WHEN 0 THEN Cast(Year(SalesDate) as CHAR(8))
WHEN 1 THEN 'All Years'
END AS Year, Count(*) as Count
FROM RawData
GROUP BY Category, Year(SalesDate)
WITH CUBE
-- Compute
SELECT Category, SalesDate, Amount
FROM RawData
WHERE Year(SalesDate) = '2006'
COMPUTE Avg(Amount), sum(Amount)
SELECT Category, SalesDate, Amount
FROM RawData
WHERE Year(SalesDate) = '2006'
ORDER BY Category
COMPUTE Avg(Amount), sum(Amount)
BY Category
SELECT Category, SalesDate, Amount
FROM RawData
WHERE Year(SalesDate) = '2006'
ORDER BY Category
COMPUTE avg(Amount), sum(Amount)
COMPUTE sum(Amount)
BY Category
-------------------------------------------------------------------
-- Generating Crosstab(Pivot) Tables
-- Fixed Column CrossTab with Category Subtotal- CASE Method
SELECT Category,
SUM(Case Region WHEN 'South' THEN Amount ELSE 0 END) AS South,
SUM(Case Region WHEN 'NorthEast' THEN Amount ELSE 0 END) AS NorthEast,
SUM(Case Region WHEN 'MidWest' THEN Amount ELSE 0 END) AS MidWest,
SUM(Case Region WHEN 'West' THEN Amount ELSE 0 END) AS West,
SUM(Amount) as Total
FROM RawData
GROUP BY Category
ORDER BY Category
-- Fixed Column Crosstab - PIVOT Method
SELECT Category, SalesDate, South, NorthEast, MidWest, West
FROM RawData
PIVOT
(Sum (Amount)
FOR Region IN (South, NorthEast, MidWest, West)
) AS pt
SELECT Category, South, NorthEast, MidWest, West
FROM (Select Category, Region, Amount from RawData) sq
PIVOT
(Sum (Amount)
FOR Region IN (South, NorthEast, MidWest, West)
) AS pt
-- Fixed Column Crosstab with Category Subtotal - PIVOT Method
SELECT Category, South, NorthEast, MidWest, West,
IsNull(South,0) + IsNull(NorthEast,0) + IsNull(MidWest,0) + IsNull(West,0) as Total
FROM (Select Category, Region, Amount from RawData) sq
PIVOT
(Sum (Amount)
FOR Region IN (South, NorthEast, MidWest, West)
) AS pt
-- Fixed Column Crosstab with Filter - PIVOT Method
-- Must filter within the FROM clause (using subquery) prior to Pivot operation
SELECT Category, South, NorthEast, MidWest, West,
IsNull(South,0) + IsNull(NorthEast,0) + IsNull(MidWest,0) + IsNull(West,0) as Total
FROM (Select Region, Category, Amount
From RawData
Where Category = 'Z') sq
PIVOT
(Sum (Amount)
FOR Region IN (South, NorthEast, MidWest, West)
) AS pt
-- Dynamic CrossTabs with Multiple Assignment Variable and Pivot Method
-- Appending to a variable within a query to dynamically generate the column names
DECLARE @SQLStr NVARCHAR(1024)
SET @SQLStr = ''
SELECT @SQLStr = @SQLStr + [a].[Column] + ', '
FROM
(SELECT DISTINCT Region as [Column]
FROM RawData ) as a
SET @SQLStr = Left(@SQLStr, Len(@SQLStr) - 1)
SET @SQLStr = 'SELECT Category, '
+ @SQLStr
+ ' FROM (Select Category, Region, Amount from RawData) sq PIVOT (Sum (Amount) FOR Region IN ('
+ @SQLStr
+ ')) AS pt'
PRINT @SQLStr
EXEC sp_executesql @SQLStr
---------------------------------------------------------------
-- Calculating Cummulative Sums (Running Totals)
-- Simple Correlated Subquery Method
-- Groups same DateTime data together
SELECT SalesDate, Amount,
(SELECT SUM(Amount)
FROM RawData
WHERE SalesDate <= R.SalesDate ) as Balance
FROM RawData R
ORDER BY SalesDate
-- Cursor based - Improved Solution
ALTER TABLE RawData
ADD Balance INT
ALTER TABLE RawData
ADD ID INT IDENTITY NOT NULL PRIMARY KEY
-- Post some balances
UPDATE RawData
SET Balance =
(SELECT SUM(Amount)
FROM RawData
WHERE SalesDate <= R.SalesDate )
FROM RawData R
WHERE SalesDate <= '4/1/2006'
-- Cursor - 5 steps
DECLARE
@BeginningBalanceDate DateTime,
@BeginningBalanceAmount INT,
@CurrentBalance INT,
@CurrentID INT,
@CurrentAmount INT
SELECT
@BeginningBalanceDate = SalesDate,
@CurrentBalance = Balance
FROM RawData
WHERE SalesDate =
(SELECT Max(SalesDate) -- Identify the last good balance date
FROM RawData
WHERE SalesDate < -- row prior to earliest row with null balance
(SELECT Min(SalesDate)
FROM RawData
WHERE Balance IS NULL))
-- 1 Declare the cursor
-- Select all rows following last row with a good balance
-- (this will handle gaps in the balance)
DECLARE cBalance CURSOR FAST_FORWARD READ_ONLY
FOR SELECT ID, Amount
FROM RawData
WHERE SalesDate > @BeginningBalanceDate
ORDER BY SalesDate
-- 2 Open the Cursor
OPEN cBalance
-- 3 Prime the Cursor, then loop
FETCH cBalance INTO @CurrentID, @CurrentAmount -- prime the cursor
WHILE @@Fetch_Status = 0
BEGIN
SET @CurrentBalance = @CurrentBalance + IsNull(@CurrentAmount, 0)
UPDATE RawData
SET Balance = @CurrentBalance
WHERE ID = @CurrentID
PRINT @CurrentBalance
-- 3
FETCH cBalance INTO @CurrentID, @CurrentAmount -- fetch next
END
-- 4 Close
CLOSE cBalance
-- 5 Clean up Memory
DEALLOCATE cBalance
-- view the result
SELECT * FROM RawData ORDER BY SalesDate