Facebook Sign in | Join
Getting Started with Adobe After Effects - Part 6: Motion Blur
First Time? You can support us by signing up. It takes only 5 seconds. Click here to sign up. If you already have an account, click here to login.

Solution to TSQL Challenge 34 - Search for two keywords within the maximum distance of one word between them By stefan_g

-- tc34_v3
--
-- Stefan_G

-- Algorithm:
--
-- * Split the varchar(max) strings into varchar(8000) strings using a recursive cte
-- * Split the strings into words
-- * assign each word a number within the string
-- * create one stream of words matching text1 and another stream of words matching text2
-- * join the two streams and filter out the pairs where the distance <= 2

-- v2: Use improved chunking routine
-- v3: Make sure the chunking algorithm works even if some words are more than 8000 chars long

/*
Chunking algorithm:

p1 = index of start of start of chunk (always points to a separator)
p2 = index of separator after this chunk
d = this chunk as varchar(8000) (always starts and ends with a separator)
		If there are no separators within the first 8000 chars in a chunk, we only return the first 52 characters as this chunk
data = the whole varchar(max) phrase

p1=1 
p2=case when charindex(' ',data,1+1)-1>=8000 then charindex(' ',data,1+1) else 1+8000-charindex(' ', reverse(cast(substring(data,1,8000) as varchar(8000)))) end
d=case when charindex(' ',data,1+1)-1>=8000 then cast(substring(data, 1, 52)+' ' as varchar(8000)) else original end
*/

-- Splitting a varchar(max) to words directly with a tally split is too slow.
-- Use a recursive cte to first split the string into a number of varchar(8000) chunks

;with
cte0 as (
	-- add trailing and leading separators to optimize tally split below
	select textid, ' '+data+' ' as data 
	from tc34_phrases with (nolock)
)
, cte1 as (

	-- seed
	select
		textid,
		-- p1=index of start of chunk
		-- p2=index of separator after this chunk
		-- d=this chunk as a varchar(8000). 
		--     Always starts and ends with a separator
		--     If no separators are found within the first 8000 chars this string only returns the first 52 characters of the chunk
		--     52 characters are used so it never matches a search word - search words are max 50 chars
		p1 = cast(1 as int),
		p2 = cast(case when charindex(' ', data, 1+1)-1 >= 8000 
				then charindex(' ', data, 1+1) 
				else 1+8000-charindex(' ', reverse(cast(substring(data,1,8000) as varchar(8000)))) end as int),
		d =  case when charindex(' ', data, 1+1)-1 >= 8000 
				then cast(substring(data, 1, 52)+' ' as varchar(8000))
				else cast(substring(data,1,
					1+8000-charindex(' ', reverse(cast(substring(data,1,8000) as varchar(8000))))
					) as varchar(8000)) end,
		data
	from cte0
	
	union all
	
	-- recursion
	select
		textid,
		-- this chunk starts at the separator used to split off the previous chunk
		p1 = cast(p2 as int),
		p2 = cast(case when charindex(' ', data, p2+1)-p2 >= 8000 
				then charindex(' ', data, p2+1) 
				else p2+8000-charindex(' ', reverse(cast(substring(data,p2,8000) as varchar(8000)))) end as int),
		d =  case when charindex(' ', data, p2+1)-p2 >= 8000 
				then cast(substring(data, p2, 52)+' ' as varchar(8000))
				else cast(substring(data,p2,
					1+8000-charindex(' ', reverse(cast(substring(data,p2,8000) as varchar(8000))))
					) as varchar(8000)) end,
		data
	from cte1
	where datalength(data) > p2
)
--select TextID,p1,p2,datalength(data) as datalen, datalength(d) as dlen,d,data from cte1 order by TextID,p1

-- Use a tally to split the chunks, treat multiple spaces as one
, cte2 as (
	select textid, p1+n as n, substring(d,n+1,charindex(' ',d,n+1)-n-1) as word
	from cte1
	join tsqlc_Tally with (nolock)
		on substring(d,n,1)=' ' and substring(d,n+1,1)>' ' and n>0 and n<len(d)
)
-- add a row_number to indicate the number of each word within each original string
,cte4 as (
	select *, row_number() over(partition by textid order by n) as wordno
	from cte2
)
-- get all keywords matching text1
,search1 as (
	select textid, wordno, searchid, word
	from cte4 a
	inner join tc34_searches b with (nolock)
		on a.word = b.text1
)
-- get all keywords matching text2
,search2 as (
	select textid, wordno, searchid, word
	from cte4 a
	inner join tc34_searches b with (nolock)
		on a.word = b.text2
)
--select * from search2 order by TextID, wordno

-- join the two streams of keywords, find pairs where the distance is <= 2
,cte5 as (
	select s1.textid, s1.searchid, s1.word as word1, s1.wordno as wordno1, s2.word as word2, s2.wordno as wordno2
	from search1 s1
	inner join search2 s2
		on 	s1.searchid = s2.searchid
			and
			s1.textid = s2.textid
	where abs(s1.wordno - s2.wordno) <= 2
)
-- only return one row for each searchid, textid
select distinct searchid, textid
from cte5
order by searchid, textid
-- no parallellism gives less cpu, reads and writes. It increases duration slightly but as a whole it should be better.
option (maxdop 1)

Performance stats of the above solution

Sr# StartTime                 Reads      Writes         CPU Duration 
--- ------------------- ----------- ----------- ----------- -------- 
  1 Dec 10 2010 12:00PM      176056        1417        2745    2.957
  2 Dec 10 2010  1:11PM      176047        1416        2715    3.271
  3 Dec 10 2010  2:22PM      176047        1416        2714    2.976
  4 Dec 10 2010  3:33PM      176047        1416        2777    3.534
  5 Dec 10 2010  4:44PM      176046        1416        2668    3.307

Copyright © Rivera Informatic Private Ltd.