|
At many clients I see that they import files one row at a time using cursors. There is a similar theme of inserting one row into a table, getting the @@identity, using that to insert one row into a child table, getting that @@identity and inserting into a third related table etc. then moving on to the next row in the file--all in one huge transaction. Toy Scenario: 1 I would like to use set based processing for speed 2 I have 10,000,000 rows in a flat data file that need processing. Given the size I don't want it all in one transaction 3 All updates/inserts should be in batches to mitigate locking contention 4 If the server dies in the middle of the import I need to be able to return the state of the db to before the import started I have come up with the solution below. What I don't like Is I am writing twice, once in the table and once to update the IDs in my import/work table so I'm thinking there is room for speed enhancement. Is there a better solution? What have you done in the past?
CREATE TABLE SSCImport(ImportRowID int identity,ImportID uniqueidentifier,Name varchar(50),Number varchar(50)/* 100 more fields*/,CallAgentID int,CallTime datetime, CustomerID int null,CallLogID int /*...18 more table ids*/)
CREATE TABLE #tmpImported(ImportedRowID int)
CREATE TABLE SSCCustomer(CustomerID int identity, Name varchar(50),Number varchar(50),ImportRowID int,ImportID uniqueidentifier)
CREATE TABLE SSCCallLog(CallLogID int identity,CallAgentID int,CustomerID int,CallTime datetime,ImportRowID int,ImportID uniqueidentifier)
--...18 more tables
CREATE CLUSTERED INDEX idx ON SSCImport(ImportRowID)
CREATE CLUSTERED INDEX idx ON #tmpImported(ImportedRowID)
CREATE CLUSTERED INDEX idx ON SSCCallLog(CallLogID)
go
CREATE FUNCTION dbo.GetExistingCustomerID(@Name varchar(50),@Number varchar(50))
RETURNS int
AS
BEGIN
--complex multi-statement logic to determine if customer exists
DECLARE @CustomerID int
SET @CustomerID = CASE WHEN @Name='Alice' AND @Number='1111111111' THEN 1 ELSE NULL END
RETURN @CustomerID
END
go
DECLARE @ImportID uniqueidentifier
SET @ImportID = NEWID()
--create existing customer
INSERT SSCCustomer
SELECT 'Alice','1111111111',1,NEWID()
--bulk insert records into import table (10,000,000 rows)
INSERT SSCImport
SELECT @ImportID,'Alice','1111111111',1,'2010-01-09 09:47:44.410',NULL,NULL UNION ALL
SELECT @ImportID,'Mary','2222222222',2,'2010-01-09 010:47:44.410',NULL,NULL UNION ALL
SELECT @ImportID,'Bob','3333333333',3,'2010-01-09 011:47:44.410',NULL,NULL
DECLARE @BatchRows int,@RowCount int
SET @BatchRows = 5000
--query from MDM/CDI store to get existing customerid, store customerid in import table for later reference
BEGIN TRY
WHILE 1=1
BEGIN
BEGIN TRAN
;WITH cteImported AS(
SELECT ImportedRowID from #tmpImported
)
UPDATE TOP(@BatchRows) i
SET CustomerID = dbo.GetExistingCustomerID(Name,Number)
OUTPUT inserted.ImportRowID INTO #tmpImported
FROM SSCImport i
LEFT JOIN cteImported ON (cteImported.ImportedRowID = i.ImportRowID)
WHERE cteImported.ImportedRowID IS NULL
AND i.ImportID = @ImportID
SET @RowCount = @@ROWCOUNT
COMMIT TRAN
IF @RowCount < @BatchRows BREAK
END
--insert new customers
TRUNCATE TABLE #tmpImported
WHILE 1=1
BEGIN
BEGIN TRAN
;WITH cteImported AS(
SELECT ImportedRowID from #tmpImported
)
INSERT TOP (@BatchRows) SSCCustomer
OUTPUT inserted.ImportRowID INTO #tmpImported
SELECT Name,Number,ImportRowID,@ImportID
FROM SSCImport i
LEFT JOIN cteImported ON (cteImported.ImportedRowID = i.ImportRowID)
WHERE cteImported.ImportedRowID IS NULL
AND i.CustomerID IS NULL
AND i.ImportID = @ImportID
SET @RowCount = @@ROWCOUNT
--store customerid in import table for later reference
UPDATE i
SET CustomerID = c.CustomerID
FROM SSCCustomer c
JOIN SSCImport i ON (i.ImportID = c.ImportID AND i.ImportRowID = c.ImportRowID)
COMMIT TRAN
IF @RowCount < @BatchRows BREAK
END
--insert new call log data
TRUNCATE TABLE #tmpImported
WHILE 1=1
BEGIN
BEGIN TRAN
;WITH cteImported AS(
SELECT ImportedRowID from #tmpImported
)
INSERT TOP (@BatchRows) SSCCallLog
OUTPUT inserted.ImportRowID INTO #tmpImported
SELECT CallAgentID,CustomerID,CallTime,ImportRowID,@ImportID
FROM SSCImport i
LEFT JOIN cteImported ON (cteImported.ImportedRowID = i.ImportRowID)
WHERE cteImported.ImportedRowID IS NULL
AND i.CustomerID IS NOT NULL
AND i.ImportID = @ImportID
SET @RowCount = @@ROWCOUNT
--store calllogid in import table for later reference
UPDATE i
SET CallLogID = cl.CallLogID
FROM SSCCallLog cl
JOIN SSCImport i ON (i.ImportID = cl.ImportID AND i.ImportRowID = cl.ImportRowID)
COMMIT TRAN
IF @RowCount < @BatchRows BREAK
END
-- ...update 18 more tables
END TRY
BEGIN CATCH
IF XACT_STATE() <> 0 ROLLBACK TRAN
DELETE FROM SSCCustomer WHERE ImportID = @ImportID
DELETE FROM SSCCallLog WHERE ImportID = @ImportID
END CATCH
SELECT * FROM SSCImport
SELECT * FROM SSCCustomer
SELECT * FROM SSCCallLog
DROP TABLE SSCImport,#tmpImported,SSCCustomer,SSCCallLog
DROP FUNCTION dbo.GetExistingCustomerID
(comments are locked)
|
|
I would say that this looks pretty sweet to me. I've had very similar experiences in the past, particularly with one system that had to load data from 350 petrol stations in near-real time. Initially there was an RBAR solution, and I re-wrote it to something similar to the above, which really made the whole thing a lot less painful. The only other solution that I thought of was to use GUIDs instead of int IDENTITY rows, and use those for all primary/foreign keys. That way, you could pre-process the data on the client, and load directly into the target tables using BCP, knowing that you had already related one set of data to the other using client-generated GUIDs. However, I thought that if I ever had to load a large set of data (which I did once, after a tech guy broke the storage system by removing a disk shelf accidentally), then the amount of memory needed to do the client side relational work would be pretty prohibitive. I also thought that it would be slightly arrogant of me to assume that I would be better at handling large data operations than the people who wrote SQL Server... Interesting point about relating the data on the client using GUIDs. Files could be split and the processing spread across multiple pcs if needed. Then SSIS or bcp could be used to dump the data quite quickly. I had to do this years ago and didn't know if there was a better way now.
Jan 09 '10 at 07:10 PM
Scot Hauder
You've got me thinking, I like the idea of client side relating but I also like 4 byte IDS. If only there were an "ID broker" where you could request/reserve a range of IDs, do the relating on the client and then insert at your leisure. What comes to mind is take an exclusive lock on the table and reseed the identity+=@batchsize then do an identity insert on to add the records. I might get this thing to fly
Jan 09 '10 at 09:51 PM
Scot Hauder
(comments are locked)
|
|
Ok Matt, here ya go. I will try it out on a 10,000,000 file
CREATE TABLE SSCImport(ImportRowID int identity,ImportID uniqueidentifier,Name varchar(50),Number varchar(50)/* 100 more fields*/,CallAgentID int,CallTime datetime, CustomerID int null,CallLogID int /*...18 more table ids*/)
CREATE TABLE #tmpImported(ImportedRowID int)
CREATE TABLE SSCCustomer(CustomerID int identity(1,1), Name varchar(50),Number varchar(50),ImportRowID int,ImportID uniqueidentifier)
CREATE TABLE SSCCallLog(CallLogID int identity(1,1),CallAgentID int,CustomerID int,CallTime datetime,ImportRowID int,ImportID uniqueidentifier)
--...18 more tables
CREATE CLUSTERED INDEX idx ON SSCImport(ImportRowID)
CREATE CLUSTERED INDEX idx ON #tmpImported(ImportedRowID)
CREATE CLUSTERED INDEX idx ON SSCCallLog(CallLogID)
go
CREATE PROC ReserveIDs(@TableName varchar(50),@NumberOfIDs int, @RangeStart int OUT)
WITH EXECUTE AS OWNER
AS
BEGIN
SET NOCOUNT ON
DECLARE @ret int
BEGIN TRAN
IF @TableName = 'SSCCustomer'
BEGIN
SELECT @ret = COUNT(*) FROM SSCCustomer WITH(HOLDLOCK,TABLOCKX)
SELECT @RangeStart = IDENT_CURRENT(@TableName)
SET @NumberOfIDs = @NumberOfIDs + @RangeStart
DBCC CHECKIDENT(@TableName,RESEED, @NumberOfIDs)
END
IF @TableName = 'SSCCallLog'
BEGIN
SELECT @ret = COUNT(*) FROM SSCCallLog WITH(HOLDLOCK,TABLOCKX)
SELECT @RangeStart = IDENT_CURRENT(@TableName)
SET @NumberOfIDs = @NumberOfIDs + @RangeStart
DBCC CHECKIDENT(@TableName,RESEED, @NumberOfIDs)
END
COMMIT TRAN
END
go
CREATE FUNCTION dbo.GetExistingCustomerID(@Name varchar(50),@Number varchar(50))
RETURNS int
AS
BEGIN
--complex multi-statement logic to determine if customer exists
DECLARE @CustomerID int
SET @CustomerID = CASE WHEN @Name='Alice' AND @Number='1111111111' THEN 1 ELSE NULL END
RETURN @CustomerID
END
go
DECLARE @ImportID uniqueidentifier
SET @ImportID = NEWID()
--create existing customer
INSERT SSCCustomer
SELECT 'Alice','1111111111',1,NEWID()
--bulk insert records into import table (10,000,000 rows)
INSERT SSCImport
SELECT @ImportID,'Alice','1111111111',1,'2010-01-09 09:47:44.410',NULL,NULL UNION ALL
SELECT @ImportID,'Mary','2222222222',2,'2010-01-09 010:47:44.410',NULL,NULL UNION ALL
SELECT @ImportID,'Bob','3333333333',3,'2010-01-09 011:47:44.410',NULL,NULL
DECLARE @BatchRows int,@RowCount int, @NumberOfRows int, @RangeStart int, @RangeEnd int, @NewID int
SET @BatchRows = 5000
--query from MDM/CDI store to get existing customerid, store customerid in import table for later reference
BEGIN TRY
WHILE 1=1
BEGIN
BEGIN TRAN
;WITH cteImported AS(
SELECT ImportedRowID from #tmpImported
)
UPDATE TOP(@BatchRows) i
SET CustomerID = dbo.GetExistingCustomerID(Name,Number)
OUTPUT inserted.ImportRowID INTO #tmpImported
FROM SSCImport i
LEFT JOIN cteImported ON (cteImported.ImportedRowID = i.ImportRowID)
WHERE cteImported.ImportedRowID IS NULL
AND i.ImportID = @ImportID
SET @RowCount = @@ROWCOUNT
COMMIT TRAN
IF @RowCount < @BatchRows BREAK
END
--get number of new customer rows to insert
SELECT @NumberOfRows = COUNT(*)
FROM SSCImport
WHERE CustomerID IS NULL
--update import table with ids just reserved
EXEC dbo.ReserveIDs 'SSCCustomer',@NumberOfRows,@NewID OUT
SET @RangeStart = @NewID + 1
TRUNCATE TABLE #tmpImported
WHILE 1=1
BEGIN
;WITH cteImported AS(
SELECT ImportedRowID from #tmpImported
)
UPDATE TOP(@BatchRows) i
SET @NewID = CustomerID = @NewID + 1
OUTPUT inserted.ImportRowID INTO #tmpImported
FROM SSCImport i
LEFT JOIN cteImported ON (cteImported.ImportedRowID = i.ImportRowID)
WHERE CustomerID IS NULL
SET @RowCount = @@ROWCOUNT
IF @RowCount < @BatchRows BREAK
END
SET @RangeEnd = @NewID
--insert new customers
TRUNCATE TABLE #tmpImported
WHILE 1=1
BEGIN
BEGIN TRAN
SET IDENTITY_INSERT SSCCustomer ON
;WITH cteImported AS(
SELECT ImportedRowID from #tmpImported
)
INSERT TOP (@BatchRows) SSCCustomer(CustomerID, Name,Number,ImportRowID,ImportID)
OUTPUT inserted.ImportRowID INTO #tmpImported
SELECT CustomerID, Name,Number,ImportRowID,@ImportID
FROM SSCImport i
LEFT JOIN cteImported ON (cteImported.ImportedRowID = i.ImportRowID)
WHERE cteImported.ImportedRowID IS NULL
AND i.CustomerID BETWEEN @RangeStart AND @RangeEnd
AND i.ImportID = @ImportID
SET @RowCount = @@ROWCOUNT
SET IDENTITY_INSERT SSCCustomer OFF
COMMIT TRAN
IF @RowCount < @BatchRows BREAK
END
--get number of new call log rows to insert
SELECT @NumberOfRows = COUNT(*)
FROM SSCImport
WHERE CallLogID IS NULL
--update import table with ids just reserved
EXEC dbo.ReserveIDs 'SSCCallLog',@NumberOfRows,@NewID OUT
SET @RangeStart = @NewID + 1
TRUNCATE TABLE #tmpImported
WHILE 1=1
BEGIN
;WITH cteImported AS(
SELECT ImportedRowID from #tmpImported
)
UPDATE TOP(@BatchRows) i
SET @NewID = CallLogID = @NewID + 1
OUTPUT inserted.ImportRowID INTO #tmpImported
FROM SSCImport i
LEFT JOIN cteImported ON (cteImported.ImportedRowID = i.ImportRowID)
WHERE CallLogID IS NULL
SET @RowCount = @@ROWCOUNT
IF @RowCount < @BatchRows BREAK
END
SET @RangeEnd = @NewID
--insert new call log data
TRUNCATE TABLE #tmpImported
WHILE 1=1
BEGIN
BEGIN TRAN
SET IDENTITY_INSERT SSCCallLog ON
;WITH cteImported AS(
SELECT ImportedRowID from #tmpImported
)
INSERT TOP (@BatchRows) SSCCallLog(CallLogID,CallAgentID,CustomerID,CallTime,ImportRowID,ImportID)
OUTPUT inserted.ImportRowID INTO #tmpImported
SELECT CallLogID,CallAgentID,CustomerID,CallTime,ImportRowID,@ImportID
FROM SSCImport i
LEFT JOIN cteImported ON (cteImported.ImportedRowID = i.ImportRowID)
WHERE cteImported.ImportedRowID IS NULL
AND i.CallLogID BETWEEN @RangeStart AND @RangeEnd
AND i.ImportID = @ImportID
SET @RowCount = @@ROWCOUNT
SET IDENTITY_INSERT SSCCallLog OFF
COMMIT TRAN
IF @RowCount < @BatchRows BREAK
END
-- ...update 18 more tables
END TRY
BEGIN CATCH
IF XACT_STATE() <> 0 ROLLBACK TRAN
DELETE FROM SSCCustomer WHERE ImportID = @ImportID
DELETE FROM SSCCallLog WHERE ImportID = @ImportID
END CATCH
SELECT * FROM SSCImport
SELECT * FROM SSCCustomer
SELECT * FROM SSCCallLog
DROP TABLE SSCImport,#tmpImported,SSCCustomer,SSCCallLog
DROP FUNCTION dbo.GetExistingCustomerID
DROP PROC dbo.ReserveIDs
(comments are locked)
|
|
I use a sequence table for this. See This SSC Thread for an example script (in the context of the SSC thread).
(comments are locked)
|


Have you considered bcp or SSIS?