-- Jump to content
--
-- Module:Str find word/sandbox
--
-- From Wikipedia, the free encyclopedia

-- 2023-04-17 STABLE wrt basics, quotes "" '' * with base sep; working on resultstring & report
-- todo: report options, more options
-- todo: options count, pattern, out-table, out-htmllist, keepinputordersource
require('strict')
local p	= {}
local mArgs		= require('Module:Arguments')
local str		= require('Module:String')
local yesno 	= require('Module:Yesno')
local tTools 	= require('Module:TableTools')
local strDeEnCode = require('Module:DecodeEncode')
local iMaxWords	= 12 -- alpha-status, Apr2023. when stable, can be higher
local tArgs		= {}
local report	= nil -- initinated when explain=T

-- Classify the |explain= setting held in tArgs.explain.
--		yes-like value (as judged by yesno)	=> type 'preview', report on
--		'testcases' or 'doc'				=> that same word, report on
--		nil, no-like value, any other word	=> report off
-- Side effect: tArgs.explain is overwritten with the on/off boolean.
-- Returns the report type: 'preview', 'testcases' or 'doc' when on;
-- when off, false (nil / no-like input) or the unrecognised word itself.
local function parseReportType( tArgs )
	-- The raw value is passed to yesno() as its own default, so a word that
	-- yesno() does not recognise is expected to come back unchanged.
	-- A nil result is turned into false.
	local sType = yesno( tArgs.explain, tArgs.explain ) or false
	local bReportOn = false

	if sType == true then
		sType = 'preview'
		bReportOn = true
	elseif sType == 'testcases' or sType == 'doc' then
		bReportOn = true
	end
	-- anything else (false, unknown word): the report stays off

	tArgs.explain = bReportOn
	return sType
end

-- Load the report submodule and store it in the module-level `report`
-- variable, then make a first (dummy) call into it.
-- The tArgs parameter is accepted but not used.
local function initReport( tArgs )
	local tReportModule = require('Module:Str find word/report')
	report = tReportModule
	report.xpCheckExplain() -- dummy
end

-- Ask Module:If preview whether the page is being previewed and return
-- whatever its main() hands back.
-- Not called anywhere in this file.
-- NOTE(review): main() is called here with two plain values (true, false);
-- confirm that Module:If preview's main() accepts them in this form (it may
-- expect a frame object with .args instead).
local function isPreview( ) -- here or in report?
local ifPreview = require('Module:If preview')
	-- return not ( ifPreview._warning( {'is_preview'} ) == '' )
	return ifPreview.main( true, false )
end

-- Decode a string as early as possible (via Module:DecodeEncode's _decode;
-- presumably HTML entities such as a character code into the character itself --
-- the example in the original note was garbled), then collapse every run of
-- whitespace ('%s+'; the original note says this includes nbsp etc.) into a
-- single space and trim both ends.
-- Returns nil for nil input.
local function sDecodeTrim( sText )
	if sText == nil then
		return nil
	end
	local sDecoded = strDeEnCode._decode( sText )
	local sSingleSpaced = mw.ustring.gsub( sDecoded, '%s+' , ' ' )
	return mw.text.trim( sSingleSpaced )
end

-- %-escape a word (character string) so that it can be fed literally into a
-- string pattern function.
-- Thin wrapper: the work is delegated to _escapePattern of Module:String
-- (loaded at the top of this file as `str`).
-- Per the original note, these 12 characters get %-escaped: ().%+-*?[^$]
-- using "([%(%)%.%%%+%-%*%?%[%^%$%]])" -- not verifiable from this file.
local function escape_word( word )
	return str._escapePattern( word )
end

-- Remove one outer pair of quotes, '..' or "..", together with the
-- whitespace outside that pair.
-- s:			the string to unquote; nil is passed through as nil
-- bTrimAfter:	when exactly true, the result (for a quoted string: the inner
--				string, as in the case " abc ") is trimmed before returning
-- A string that does not start with a quote (after optional whitespace) is
-- left unchanged unless bTrimAfter asks for trimming.
-- A string that opens with a quote but does not close with the same quote
-- keeps its quote character(s).
local function removeOuterQuotes( s, bTrimAfter )
	if s == nil then return nil end

	-- mw.ustring.match returns nil when there is no match, never '', so both
	-- tests must compare against nil. (The double-quote test used to compare
	-- against '', which made it always true and trimmed every unquoted string
	-- even with bTrimAfter = false.)
	if mw.ustring.match( s, "^%s*\'" ) ~= nil then
		s = mw.ustring.gsub( s, "^%s*%\'(.*)%\'%s*$", "%1" )
	elseif mw.ustring.match( s, '^%s*\"' ) ~= nil then
		s = mw.ustring.gsub( mw.text.trim( s ), '^%\"(.*)%\"$', '%1' )
	end
	if bTrimAfter == true then
		s = mw.text.trim( s )
	end
	return s
end

-- Determine the separator used to split the incoming word lists.
-- All alphanumerics (%w) and whitespace (%s) are stripped from the decoded
-- and trimmed input; what remains is the separator. When no input was given
-- or nothing remains, the decoded and trimmed default is returned instead.
-- todo: check characters '" _ {}(); & accept?'
local function setSepIn( sSep, sDefaultSep )
	local sCandidate = ''
	if sSep ~= nil then
		sCandidate = mw.ustring.gsub( sDecodeTrim( sSep ), '[%w%s]*', '' ) or ''
	end
	if sCandidate ~= '' then
		return sCandidate
	end
	return sDecodeTrim( sDefaultSep )
end

-- Determine the separator used when joining the words of the result.
-- The input is decoded and trimmed, then one outer pair of quotes is removed
-- without trimming what was inside (so a quoted " - " keeps its spaces).
-- When no input was given or the outcome is blank, sDefaultSep is returned
-- as is (not trimmed).
local function setSepOut( sSep, sDefaultSep )
	local sClean = sDecodeTrim( sSep )
	if sClean == nil then
		return sDefaultSep
	end
	sClean = removeOuterQuotes( sClean, false )
	if sClean ~= '' then
		return sClean
	end
	return sDefaultSep
end

-- Look up a single word in a table (a simple array of words).
-- Comparison is by plain equality (==).
-- Returns the word itself on the first hit, or nil when it is not present.
local function findWordInTable( tSource, word )
	for _, sCandidate in ipairs( tSource ) do
		if sCandidate == word then
			return word
		end
	end
	return nil
end

-- Reads and parses a word list and returns a table with words (simple array)
-- The word list can be: the source, the AND-words to check, the OR-words to
-- check, or the list of one synonym set.
-- Reads the module-level tArgs: sep, sep_pattern, case, booleans, synonymsTables.
-- step 1: basic preparation of the csv wordstring (decode, trim, wrap in seps)
-- step 2: when case-insensitive, turn string into lowercase
-- step 3: read (parse) quoted '..'
-- step 4: read (parse) quoted ".."
-- step 5: read (parse) separator-delimited words (at most iMaxWords)
-- step 6: merge quoted words back into the word table; keep in order
-- step 7: when booleans=T, change boolean words into 'true'/'false' (module:yesno rules)
-- step 8: replace synonyms (by "|_nov=November, 11" input)
-- step 9: remove duplicates from the word table
-- 		all words returned are trimmed
-- return the table (a straight array); empty table for nil or blank input
local function buildWordTable( sWordlist )
	local wordTable = {}
	local hitWord
	local hitCount
	local sPattern
	local cQ1 = '_Q0027_' -- placeholder for a '..' quoted word; U+0027 = \'
	local cQ2 = '_Q0022_' -- placeholder for a ".." quoted word; U+0022 = \"
	local tQ1hits	= {} -- Q1-hits, reused to restore order
	local tQ2hits	= {} -- Q2-hits, reused to restore order

	-- Step 1: prepare sWordList
	-- (the result of sDecodeTrim must be assigned; it does not work in place)
	sWordlist = sDecodeTrim( sWordlist )
	if sWordlist == nil or sWordlist == '' then return wordTable end
	-- a separator on both ends lets every word be found between two separators
	sWordlist = tArgs.sep .. sWordlist .. tArgs.sep

	-- Step 2: case sensitive
	if yesno( tArgs.case, true ) == false then
		-- mw.ustring.lower also lowercases non-ASCII letters
		sWordlist = mw.ustring.lower( sWordlist )
	end

	-- Step 3: Q1 read quotes (single quotes '..')
	-- Each quoted word that fills a whole cell between separators is taken out
	-- and replaced by the placeholder cQ1, so that step 5 cannot split it.
	sPattern = '%f[^' .. tArgs.sep_pattern .. ']%s*%b\'\'%s*%f[' .. tArgs.sep_pattern .. ']'
	hitWord = sDecodeTrim( mw.ustring.match( sWordlist, sPattern ) ) or ''
	while hitWord ~= '' do
		hitWord = removeOuterQuotes( hitWord, true )
		table.insert( tQ1hits, hitWord )
		sWordlist = mw.ustring.gsub( sWordlist, sPattern, cQ1, 1 ) -- removes current 1st hit; replace with code
		-- next
		hitWord = sDecodeTrim( mw.ustring.match( sWordlist, sPattern ) ) or ''
	end

	-- Step 4: Q2 read quotes (double quotes "..")
	sPattern = '%f[^' .. tArgs.sep_pattern .. ']%s*%b\"\"%s*%f[' .. tArgs.sep_pattern .. ']'
	hitWord = sDecodeTrim( mw.ustring.match( sWordlist, sPattern ) ) or ''
	while hitWord ~= '' do
		hitWord = removeOuterQuotes( hitWord, true )
		table.insert( tQ2hits, hitWord )
		sWordlist = mw.ustring.gsub( sWordlist, sPattern, cQ2, 1 ) -- removes current 1st hit; replace with code
		-- next
		hitWord = sDecodeTrim( mw.ustring.match( sWordlist, sPattern ) ) or ''
	end

	-- Step 5: parse plain sep-delimited words
	-- str._match is asked for match number hitCount + 1; it is given the
	-- separator as its "no match" value, which serves as the end-of-list signal.
	sPattern = '%f[^' .. tArgs.sep_pattern .. '][^' .. tArgs.sep_pattern .. ']+%f[' .. tArgs.sep_pattern .. ']'
	hitCount = 0
	while hitCount < iMaxWords do
		hitWord = sDecodeTrim( str._match( sWordlist, sPattern, 1, hitCount + 1, false, tArgs.sep ) ) or ''

		if hitWord == sDecodeTrim( tArgs.sep ) then
			-- no more words found in the string
			break
		elseif hitWord ~= '' then
			hitCount = hitCount + 1
			table.insert( wordTable, hitWord )
		else -- blank word, to skip (note: but blank quotes as in .., " ", ..are kept = blank cell '')
			hitCount = hitCount + 1
		end
	end
	-- `report` is only loaded when explaining is switched on; without this
	-- guard a list of iMaxWords or more cells raised an error on the nil upvalue
	if hitCount >= iMaxWords and report ~= nil then
		report.xpMessage( 'ERR701 wordcount ' .. hitCount .. ' > maxwords' .. iMaxWords )
	end

	-- Step 6: merge quoted words & wordtable, keep order
	-- the n-th quoted hit replaces the first placeholder still present
	for _, sQW in ipairs( tQ1hits ) do
		for iW, sW in ipairs( wordTable ) do
			if sW == cQ1 then
				wordTable[iW] = sQW
				break
			end
		end
	end
	for _, sQW in ipairs( tQ2hits ) do
		for iW, sW in ipairs( wordTable ) do
			if sW == cQ2 then
				wordTable[iW] = sQW
				break
			end
		end
	end

	-- Step 7: when read as booleans, convert words to 'true'/'false'
	if tArgs.booleans then
		for i, v in ipairs( wordTable ) do
			local bValue = yesno( v )
			if bValue ~= nil then
				wordTable[i] = tostring( bValue )
			end
		end
	end

	-- Step 8: replace synonyms
	-- synonymsTables maps a target word (string key) to the array of its
	-- synonyms. The table also carries numeric keys (copied over from the
	-- numbered name entries in tArgs.synonyms); those must be skipped,
	-- otherwise a target word would be replaced by its index number.
	for aka1, tAkas in pairs( tArgs['synonymsTables'] or {} ) do
		if type( aka1 ) == 'string' then
			for iW, w in ipairs( wordTable ) do
				if findWordInTable( tAkas, w ) ~= nil then
					wordTable[iW] = aka1
				end
			end
		end
	end

	-- Step 9: remove duplicates from list
	wordTable = tTools.removeDuplicates( wordTable )

	return wordTable
end

-- AND-logic with ANDwords words: ALL words must be found
-- tWorkf: table with the arrays ANDwords and SOURCEwords
-- returns two values: T/F, hittable
-- 		T when *all* AND words are found in SOURCEwords
-- 		hittable (array) with all hit words, in ANDwords order
-- note 1: when F, the hittable still contains the words that were found
-- note 2: empty AND-wordlist => True by logic (because: not falsified)
local function checkANDwords( tWorkf )
	local bANDchk = true -- main conclusion; stays true until falsified
	local tHits = {} -- hit table

	for _, word in ipairs( tWorkf.ANDwords ) do
		local sHit = findWordInTable( tWorkf.SOURCEwords, word )
		if sHit == nil then
			-- Falsified! Logically we could stop here, but the loop goes on
			-- to complete the hit table (feature).
			bANDchk = false
		else
			tHits[#tHits + 1] = sHit
		end
	end
	-- tHits is filled by appending only, so it has no gaps and needs no
	-- compressing (the former compressSparseArray call discarded its result).
	return bANDchk, tHits
end

-- OR-logic with ORwords words: at least one word must be found
-- tWork: table with the arrays ORwords and SOURCEwords
-- returns two values: T/F, hittable
-- 		True when at least one OR word is found in SOURCEwords
-- 		hittable (array) has all hit words, in ORwords order
-- note 1: empty OR-wordlist => True by logic (because: not falsified)
-- note 2: while just one hitword is a True result, the hittable contains all words found
local function checkORwords( tWork )
	local tHits = {}

	if #tWork.ORwords == 0 then
		return true, tHits -- nothing to check: not falsified
	end

	for _, word in ipairs( tWork.ORwords ) do
		local sHit = findWordInTable( tWork.SOURCEwords, word )
		if sHit ~= nil then
			-- One hit settles the outcome, but the loop goes on to collect
			-- every hit word.
			tHits[#tHits + 1] = sHit
		end
	end
	-- tHits is filled by appending only, so it has no gaps and needs no
	-- compressing (the former compressSparseArray call discarded its result).
	return #tHits > 0, tHits
end

-- Determine the requested return value (a string)
-- tResults.resultALL:	the logical outcome; only an explicit false counts as False
-- tResults.tTRUE:		array of hit words (read when the outcome is True)
-- Reads the module-level tArgs: out_true, out_false, out_sep.
-- 		False: returns tArgs.out_false, or '' when that is not set
-- 		True, out_true set: returns tArgs.out_true unchanged
-- 		True, out_true not set (nil): returns the hit words joined with out_sep
-- note: out_true='' implies: blank return value
--- todo add pref, suff
local function yesnoReturnstring( tResults )
	if tResults.resultALL == false then -- result False
		return tArgs.out_false or ''
	end

	-- result True
	if tArgs.out_true ~= nil then
		-- some |out_true= value is entered, could be ''
		-- (returned as entered; a debugging prefix '_out-true' used to be
		-- glued in front of it)
		return tArgs.out_true
	end
	-- default: the list of hit words; an absent hit table gives ''
	return table.concat( tResults.tTRUE or {}, tArgs.out_sep )
end

-- Combine the OR hit words and the AND hit words into one new array.
-- tResult: table that may hold the arrays tORhits and tANDhits (either may be nil)
-- Returns a fresh array: first the OR hits, then the AND hits, each in their
-- own order. Not sorted, duplicates are not removed; never nil.
-- Neither input array is modified (the hits used to be inserted into
-- tResult.tANDhits itself, which altered that table for later readers).
local function tCombinedSourceorderedTRUEtables( tResult )
	local tOut = {}

	if tResult.tANDhits == nil and tResult.tORhits == nil and report ~= nil then
		-- only reportable when the report module has been loaded
		report.xpMessage( 'ERR921 BUG tOut is nil??? - tCombinedSourceorderedTRUEtables' )
	end

	for _, v in ipairs( tResult.tORhits or {} ) do
		tOut[#tOut + 1] = v
	end
	for _, v in ipairs( tResult.tANDhits or {} ) do
		tOut[#tOut + 1] = v
	end
	return tOut
end

-- Join two word lists (strings) into one list string.
-- s1, s2:	the lists; a nil list is left out
-- sSep:	optional separator to join with. When omitted, the module-level
--			tArgs.sep is used, and ',' when that is not set either.
--			Callers that run before tArgs has been filled (parseArgs does)
--			should pass the separator; joining used to fall back to '' then,
--			which glued the last word of s1 to the first word of s2.
-- Returns the joined string; '' when both lists are nil.
local function concatAndLists( s1, s2, sSep )
	local tLists = {}
	if s1 ~= nil then
		tLists[#tLists + 1] = s1
	end
	if s2 ~= nil then
		tLists[#tLists + 1] = s2
	end
	return table.concat( tLists, sSep or tArgs.sep or ',' )
end

-- ===== ===== ===== ===== ===== ===== ===== ===== =====
-- PARSE arguments
-- Turns the incoming arguments (origArgs) into a new table of settings.
-- Keys read from origArgs:
-- 		sep, out-sep, case / casesensitive, bool / booleans,
-- 		out_true, out_false, prefix / p, suffix / s, test, explain (report only),
-- 		source / s, word / w, andwords / andw, orwords / orw,
-- 		and every key that starts with '_' (a synonym set, like |_nov=November, 11)
-- Returns the new table; fields: sep, sep_pattern, out_sep, case, booleans,
-- out_true, out_false, prefix, suffix, out_format, explain, explain_type, test,
-- source, sANDlist, sORlist, synonyms, synonymsTables (left empty here).
-- NOTE(review): |s= is read both as the suffix and as the source -- confirm
-- which one is intended.
local function parseArgs( origArgs )
	local tNewArgs = {}
	local tDefault = {
		sep			= ',',
		case		= false,
		booleans	= false,
		out_sep		= ', ',
	}

	tNewArgs.sep			= setSepIn( origArgs['sep'], tDefault['sep'] )
	tNewArgs.sep_pattern	= escape_word( tNewArgs.sep )
	tNewArgs.out_sep		= setSepOut( origArgs['out-sep'] or origArgs['sep'], tDefault['out_sep'] )
	tNewArgs.case			= yesno( origArgs['case'] or origArgs['casesensitive'] ) or tDefault['case']
	tNewArgs.booleans		= yesno( origArgs['bool'] or origArgs['booleans'] ) or tDefault['booleans']
	tNewArgs.out_true		= sDecodeTrim( origArgs.out_true ) -- nil = default, so return the hit words; keep '' as legal input & return value
	tNewArgs.out_false		= sDecodeTrim( origArgs.out_false ) or ''
	tNewArgs.prefix			= sDecodeTrim( origArgs.prefix or origArgs.p ) or ''
	tNewArgs.suffix			= sDecodeTrim( origArgs.suffix or origArgs.s ) or ''
	tNewArgs.out_format		= 'default' -- todo: table, default, htmllisttype, flatlist, first,
	-- reporting is switched off here for now: explain is fixed to false
	-- instead of being read from origArgs.explain
	tNewArgs.explain		= false -- TEST17Apr origArgs.explain
	tNewArgs.explain_type	= parseReportType( tNewArgs ) or nil
	tNewArgs.test			= origArgs.test

	-- the wordlists:
	tNewArgs['source']		= origArgs['source'] or origArgs['s'] or ''
	-- The two AND lists are joined with the separator parsed above. The
	-- module-level tArgs is not filled yet at this point, so it cannot supply
	-- the separator (joining via it glued the lists together without any
	-- separator: word=a plus andwords=b gave the single word 'ab').
	local tAndLists = {}
	local sWords	= origArgs['word'] or origArgs['w']
	local sAndWords	= origArgs['andwords'] or origArgs['andw']
	if sWords ~= nil then
		tAndLists[#tAndLists + 1] = sWords
	end
	if sAndWords ~= nil then
		tAndLists[#tAndLists + 1] = sAndWords
	end
	tNewArgs['sANDlist']	= table.concat( tAndLists, tNewArgs.sep )
	tNewArgs['sORlist']		= origArgs['orwords'] or origArgs['orw'] or ''

	-- Synonym sets: every argument whose name starts with '_'.
	-- Each set is stored twice in tNewArgs.synonyms: its name is appended to
	-- the array part (numeric key), and its word list is stored under the
	-- name itself (string key).
	tNewArgs['synonyms']		= {}
	tNewArgs['synonymsTables']	= {} -- to be populated later
	for k, v in pairs( origArgs ) do
		if str._match( k, '^_%S', 1, 1, false, false ) then
			local syn1 = mw.ustring.gsub( k, '^_', '', 1 )
			table.insert( tNewArgs['synonyms'], syn1 )
			tNewArgs['synonyms'][syn1] = v
		end
	end

	if tNewArgs.explain == true then
		initReport( tNewArgs.explain )
		-- tostring() and the parentheses matter: '..' binds tighter than 'or',
		-- and concatenating nil or a boolean raises an error
		report.xpMessage( 'EXPLAIN: ' .. tostring( origArgs.explain ) .. '=>' .. tostring( tNewArgs.explain_type or 'unk' ) )
		report.xpReportSynonyms( tNewArgs )
	end

	return tNewArgs
end

-- ===== ===== ===== ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
-- _main function: check for presence of words in source string
-- origArgs: table of arguments (see parseArgs for the keys that are read)
-- Checks and returns a string:
-- 		when T: the string of all hitwords ( default ), or the out_true input
-- 		when F: empty string '' ( default ), or the out_false input
-- steps:
-- 1. input word strings are prepared ( parsed into an array of words )
-- 2. words checks are made ( applying AND-logic, OR-logic )
-- 3. final conclusion drawn ( T/F )
-- 4. based on T or F status, the return value ( string ) is established and returned
-- note 1: each return value can be '' ( nullstring )
-- note 2: the result is F when the source has no words, or when there are
-- 		no AND-words and no OR-words at all
-- Side effect: the module-level tArgs is replaced by the parsed arguments.
function p._main( origArgs )
	local tWork = {}
	local tResults = {}

	tArgs = parseArgs( origArgs )

	-- make synonyms into tables
	-- 'aka1' = target synonym (= the synonym that remains)
	-- note: tArgs.synonyms also has numbered entries holding the set names;
	-- these are turned into (one-word) tables under their number as well
	for aka1, sAkalist in pairs( tArgs['synonyms'] ) do
		tArgs['synonymsTables'][aka1] = buildWordTable( sAkalist )
	end

	-- build the worktables
	tWork['SOURCEwords']	= buildWordTable( tArgs.source )
	tWork['ANDwords']		= buildWordTable( tArgs.sANDlist )
	tWork['ORwords']		= buildWordTable( tArgs.sORlist )

	-- apply logic & conclude
	if ( #tWork.SOURCEwords == 0 ) or ( #tWork.ANDwords + #tWork.ORwords == 0 ) then
		-- No words to check
		tResults.resultALL = false
		-- `report` only exists once the report module has been loaded
		if tArgs.explain and report ~= nil then
			report.xpMessage( 'ERR201 No words to check' )
		end
	else
		tResults['bAND'], tResults['tANDhits']	= checkANDwords( tWork )
		tResults['bOR'],  tResults['tORhits']	= checkORwords( tWork )
		tResults.resultALL = ( tResults.bAND ) and ( tResults.bOR )
	end

	if tResults.resultALL == true then
		tResults.tTRUE = tCombinedSourceorderedTRUEtables( tResults ) or {}
	end
	tResults.sRESULTstring = yesnoReturnstring( tResults )

	-- Return the result string only, as described above. (A debugging prefix,
	-- the uppercased T/F outcome, used to be glued in front of it.)
	--- todo: append the preview report: report.xpPresent( tArgs, tWork, tResults )
	return tResults.sRESULTstring
end

-- Entry point for #invoke: collects the arguments from the frame through
-- Module:Arguments' getArgs and hands them to p._main; returns its result.
function p.main( frame )
	return p._main( mArgs.getArgs( frame ) )
end

return p