Solution to TSQL Challenge 34 - Search for two keywords within the maximum distance of one word between them By stefan_g
-- tc34_v3
--
-- Stefan_G
-- Algorithm:
--
-- * Split the varchar(max) strings into varchar(8000) strings using a recursive cte
-- * Split the strings into words
-- * assign each word a number within the string
-- * create one stream of words matching text1 and another stream of words matching text2
-- * join the two streams and filter out the pairs where the distance <= 2
-- v2: Use improved chunking routine
-- v3: Make sure the chunking algorithm works even if some words are more than 8000 chars long
/*
Chunking algorithm:
p1 = index of start of start of chunk (always points to a separator)
p2 = index of separator after this chunk
d = this chunk as varchar(8000) (always starts and ends with a separator)
If there are no separators within the first 8000 chars in a chunk, we only return the first 52 characters as this chunk
data = the whole varchar(max) phrase
p1=1
p2=case when charindex(' ',data,1+1)-1>=8000 then charindex(' ',data,1+1) else 1+8000-charindex(' ', reverse(cast(substring(data,1,8000) as varchar(8000)))) end
d=case when charindex(' ',data,1+1)-1>=8000 then cast(substring(data, 1, 52)+' ' as varchar(8000)) else original end
*/
-- Splitting a varchar(max) to words directly with a tally split is too slow.
-- Use a recursive cte to first split the string into a number of varchar(8000) chunks
;with
cte0 as (
-- add trailing and leading separators to optimize tally split below
select textid, ' '+data+' ' as data
from tc34_phrases with (nolock)
)
, cte1 as (
-- seed
select
textid,
-- p1=index of start of chunk
-- p2=index of separator after this chunk
-- d=this chunk as a varchar(8000).
-- Always starts and ends with a separator
-- If no separators are found within the first 8000 chars this string only returns the first 52 characters of the chunk
-- 52 characters are used so it never matches a search word - search words are max 50 chars
p1 = cast(1 as int),
p2 = cast(case when charindex(' ', data, 1+1)-1 >= 8000
then charindex(' ', data, 1+1)
else 1+8000-charindex(' ', reverse(cast(substring(data,1,8000) as varchar(8000)))) end as int),
d = case when charindex(' ', data, 1+1)-1 >= 8000
then cast(substring(data, 1, 52)+' ' as varchar(8000))
else cast(substring(data,1,
1+8000-charindex(' ', reverse(cast(substring(data,1,8000) as varchar(8000))))
) as varchar(8000)) end,
data
from cte0
union all
-- recursion
select
textid,
-- this chunk starts at the separator used to split off the previous chunk
p1 = cast(p2 as int),
p2 = cast(case when charindex(' ', data, p2+1)-p2 >= 8000
then charindex(' ', data, p2+1)
else p2+8000-charindex(' ', reverse(cast(substring(data,p2,8000) as varchar(8000)))) end as int),
d = case when charindex(' ', data, p2+1)-p2 >= 8000
then cast(substring(data, p2, 52)+' ' as varchar(8000))
else cast(substring(data,p2,
1+8000-charindex(' ', reverse(cast(substring(data,p2,8000) as varchar(8000))))
) as varchar(8000)) end,
data
from cte1
where datalength(data) > p2
)
--select TextID,p1,p2,datalength(data) as datalen, datalength(d) as dlen,d,data from cte1 order by TextID,p1
-- Use a tally to split the chunks, treat multiple spaces as one
, cte2 as (
select textid, p1+n as n, substring(d,n+1,charindex(' ',d,n+1)-n-1) as word
from cte1
join tsqlc_Tally with (nolock)
on substring(d,n,1)=' ' and substring(d,n+1,1)>' ' and n>0 and n<len(d)
)
-- add a row_number to indicate the number of each word within each original string
,cte4 as (
select *, row_number() over(partition by textid order by n) as wordno
from cte2
)
-- get all keywords matching text1
,search1 as (
select textid, wordno, searchid, word
from cte4 a
inner join tc34_searches b with (nolock)
on a.word = b.text1
)
-- get all keywords matching text2
,search2 as (
select textid, wordno, searchid, word
from cte4 a
inner join tc34_searches b with (nolock)
on a.word = b.text2
)
--select * from search2 order by TextID, wordno
-- join the two streams of keywords, find pairs where the distance is <= 2
,cte5 as (
select s1.textid, s1.searchid, s1.word as word1, s1.wordno as wordno1, s2.word as word2, s2.wordno as wordno2
from search1 s1
inner join search2 s2
on s1.searchid = s2.searchid
and
s1.textid = s2.textid
where abs(s1.wordno - s2.wordno) <= 2
)
-- only return one row for each searchid, textid
select distinct searchid, textid
from cte5
order by searchid, textid
-- no parallellism gives less cpu, reads and writes. It increases duration slightly but as a whole it should be better.
option (maxdop 1)
Sr# StartTime Reads Writes CPU Duration
--- ------------------- ----------- ----------- ----------- --------
1 Dec 10 2010 12:00PM 176056 1417 2745 2.957
2 Dec 10 2010 1:11PM 176047 1416 2715 3.271
3 Dec 10 2010 2:22PM 176047 1416 2714 2.976
4 Dec 10 2010 3:33PM 176047 1416 2777 3.534
5 Dec 10 2010 4:44PM 176046 1416 2668 3.307