---@diagnostic disable
-- lua-uni-words.lua
-- Copyright 2020--2023 Marcel Krüger
--
-- This work may be distributed and/or modified under the
-- conditions of the LaTeX Project Public License, either version 1.3
-- of this license or (at your option) any later version.
-- The latest version of this license is in
--   http://www.latex-project.org/lppl.txt
-- and version 1.3 or later is part of all distributions of LaTeX
-- version 2005/12/01 or later.
--
-- This work has the LPPL maintenance status `maintained'.
--
-- The Current Maintainer of this work is Marcel Krüger

-- Lookup tables built once at load time from the Unicode data files:
--   extended_pictographic[cp] is set for codepoints listed with the
--     Extended_Pictographic property in 'emoji-data',
--   property[cp] holds the property name captured from 'WordBreakProperty'.
-- The exact table shape is produced by lua-uni-parse's `multiset` collector.
local extended_pictographic, property do
  local parser = require'lua-uni-parse'
  local lp = lpeg or require'lpeg'

  -- Only Extended_Pictographic entries are captured (with the constant value
  -- `true`); every other line falls through to `ignore_line`.
  local pictographic_entry = parser.fields(parser.codepoint_range,
    'Extended_Pictographic' * lp.Cc(true))
  extended_pictographic = parser.parse_file('emoji-data',
    lp.Cg(pictographic_entry) + parser.ignore_line,
    parser.multiset)
  if not extended_pictographic then
    error[[Break Property matching failed]]
  end

  -- Property names consist of ASCII letters and underscores.
  local property_name = lp.C(lp.R('az', 'AZ', '__')^1)
  local property_entry = parser.fields(parser.codepoint_range, property_name)
  property = parser.parse_file('WordBreakProperty',
    lp.Cg(property_entry) + parser.ignore_line,
    parser.multiset)
  if not property then
    error[[Break Property matching failed]]
  end
end

-- Properties which most states skip over without changing state and without
-- starting a new word.
local ignorable = {
  Extend = true,
  Format = true,
  ZWJ = true,
}
-- Line-break-like properties; the PRE state hands these over to START.
local controls = {
  CR = true,
  LF = true,
  Newline = true,
}

-- Lookahead used after a letter followed by a "mid" character: decides
-- whether the mid character continues the word.
-- Returns `new_word, state`:
--   nil,   context_AHLetter_Mid  -- ignorable codepoint, keep looking ahead
--   false, 'ASTARTED'/'HSTARTED' -- a letter follows, so no boundary
--   true,  'PRE'                 -- anything else, so there is a boundary
local function context_AHLetter_Mid(cp)
  local class = property[cp]
  if ignorable[class] then
    return nil, context_AHLetter_Mid
  end
  local resumed = (class == 'ALetter' and 'ASTARTED')
    or (class == 'Hebrew_Letter' and 'HSTARTED')
  if resumed then
    return false, resumed
  end
  return true, 'PRE'
end

-- Lookahead used after a Hebrew letter followed by a double quote: the quote
-- only stays inside the word when another Hebrew letter follows.
-- Returns `new_word, state`:
--   nil,   context_HLetter_Double -- ignorable codepoint, keep looking ahead
--   false, 'HSTARTED'             -- Hebrew letter follows, so no boundary
--   true,  'PRE'                  -- anything else, so there is a boundary
local function context_HLetter_Double(cp)
  local class = property[cp]
  if ignorable[class] then
    return nil, context_HLetter_Double
  elseif class == 'Hebrew_Letter' then
    return false, 'HSTARTED'
  else
    return true, 'PRE'
  end
end

-- Lookahead used after a digit followed by a "mid" character: the mid
-- character only stays inside the number when another digit follows.
-- Returns `new_word, state`:
--   nil,   context_Numeric_Mid -- ignorable codepoint, keep looking ahead
--   false, 'NSTARTED'          -- a Numeric follows, so no boundary
--   true,  'PRE'               -- anything else, so there is a boundary
local function context_Numeric_Mid(cp)
  local class = property[cp]
  if ignorable[class] then
    return nil, context_Numeric_Mid
  end
  local is_digit = class == 'Numeric'
  if is_digit then
    return false, 'NSTARTED'
  end
  return true, 'PRE'
end

-- Transition table of the word segmentation state machine.
--
-- Every entry maps a state name to a handler which receives the word break
-- property of the next codepoint and returns either
--   * `next_state, new_word` where `next_state` is a key of this table and
--     `new_word` tells whether a word starts at the codepoint, or
--   * a single lookahead function (one of the `context_*` functions) when the
--     decision depends on the codepoints that follow.
--
-- Meaning of the states, as established by the transitions below:
--   START          beginning of text or directly after LF/Newline
--   CR             directly after CR (a following LF does not start a word)
--   PRE            generic state, nothing pending from the previous codepoint
--   WHITE          after WSegSpace
--   EXTEND         after ExtendNumLet
--   KSTARTED       after Katakana
--   RI             after an unpaired Regional_Indicator
--   ASTARTED       after ALetter
--   HSTARTED       after Hebrew_Letter
--   HSINGLE_QUOTE  after Hebrew_Letter followed by Single_Quote
--   NSTARTED       after Numeric
--
-- The table is declared before it is filled so that the handlers can refer to
-- it as an upvalue.
local state_map
state_map = {
  START = function(prop)
    if prop == 'CR' then
      return 'CR', true
    end
    if prop == 'LF' or prop == 'Newline' then
      return 'START', true
    end
    -- Only the state from PRE is kept; a word always starts here, even for
    -- ignorable codepoints.
    return state_map.PRE(prop), true
  end,
  PRE = function(prop)
    if controls[prop] then
      return state_map.START(prop)
    end
    if ignorable[prop] then
      return 'PRE', false
    end
    if prop == 'WSegSpace' then
      return 'WHITE', true
    end
    if prop == 'ALetter' then
      return 'ASTARTED', true
    end
    if prop == 'Hebrew_Letter' then
      return 'HSTARTED', true
    end
    if prop == 'Numeric' then
      return 'NSTARTED', true
    end
    if prop == 'Katakana' then
      return 'KSTARTED', true
    end
    if prop == 'ExtendNumLet' then
      return 'EXTEND', true
    end
    if prop == 'Regional_Indicator' then
      return 'RI', true
    end
    return 'PRE', true
  end,
  CR = function(prop)
    if prop == 'LF' then
      -- CR LF is kept together.
      return 'START', false
    else
      return state_map.START(prop)
    end
  end,
  WHITE = function(prop)
    if prop == 'WSegSpace' then
      -- Runs of whitespace form a single segment.
      return 'WHITE', false
    else
      return state_map.PRE(prop)
    end
  end,
  EXTEND = function(prop)
    if ignorable[prop] then
      return 'EXTEND', false
    end
    if prop == 'ALetter' then
      return 'ASTARTED', false
    end
    if prop == 'Hebrew_Letter' then
      return 'HSTARTED', false
    end
    if prop == 'Katakana' then
      return 'KSTARTED', false
    end
    if prop == 'Numeric' then
      return 'NSTARTED', false
    end
    if prop == 'ExtendNumLet' then
      return 'EXTEND', false
    end
    return state_map.PRE(prop)
  end,
  KSTARTED = function(prop)
    if ignorable[prop] then
      return 'KSTARTED', false
    end
    if prop == 'Katakana' then
      -- Stay in this state. The returned name has to be a key of state_map
      -- since it is used to look up the handler for the next codepoint.
      return 'KSTARTED', false
    end
    if prop == 'ExtendNumLet' then
      return 'EXTEND', false
    end
    return state_map.PRE(prop)
  end,
  RI = function(prop)
    if ignorable[prop] then
      return 'RI', false
    end
    if prop == 'Regional_Indicator' then
      -- The pair is complete; a further Regional_Indicator starts a new one.
      return 'PRE', false
    end
    return state_map.PRE(prop)
  end,
  ASTARTED = function(prop)
    if ignorable[prop] then
      return 'ASTARTED', false
    end
    if prop == 'ALetter' then
      return 'ASTARTED', false
    end
    if prop == 'Hebrew_Letter' then
      return 'HSTARTED', false
    end
    if prop == 'Numeric' then
      return 'NSTARTED', false
    end
    if prop == 'ExtendNumLet' then
      return 'EXTEND', false
    end
    if prop == 'MidLetter' or prop == 'MidNumLet' or prop == 'Single_Quote' then
      -- Undecided until we know whether a letter follows.
      return context_AHLetter_Mid
    end
    return state_map.PRE(prop)
  end,
  HSTARTED = function(prop)
    if ignorable[prop] then
      return 'HSTARTED', false
    end
    if prop == 'ALetter' then
      return 'ASTARTED', false
    end
    if prop == 'Hebrew_Letter' then
      return 'HSTARTED', false
    end
    if prop == 'Numeric' then
      return 'NSTARTED', false
    end
    if prop == 'ExtendNumLet' then
      return 'EXTEND', false
    end
    if prop == 'Single_Quote' then
      -- Unlike for ALetter, a single quote after a Hebrew letter never
      -- starts a new word, so no lookahead is needed.
      return 'HSINGLE_QUOTE', false
    end
    if prop == 'MidLetter' or prop == 'MidNumLet' then
      return context_AHLetter_Mid
    end
    if prop == 'Double_Quote' then
      -- Undecided until we know whether a Hebrew letter follows.
      return context_HLetter_Double
    end
    return state_map.PRE(prop)
  end,
  HSINGLE_QUOTE = function(prop)
    if ignorable[prop] then
      return 'HSINGLE_QUOTE', false
    end
    if prop == 'ALetter' then
      return 'ASTARTED', false
    end
    if prop == 'Hebrew_Letter' then
      return 'HSTARTED', false
    end
    return state_map.PRE(prop)
  end,
  NSTARTED = function(prop)
    if ignorable[prop] then
      return 'NSTARTED', false
    end
    if prop == 'ALetter' then
      return 'ASTARTED', false
    end
    if prop == 'Hebrew_Letter' then
      return 'HSTARTED', false
    end
    if prop == 'Numeric' then
      return 'NSTARTED', false
    end
    if prop == 'ExtendNumLet' then
      return 'EXTEND', false
    end
    if prop == 'MidNum' or prop == 'MidNumLet' or prop == 'Single_Quote' then
      -- Undecided until we know whether a digit follows.
      return context_Numeric_Mid
    end
    return state_map.PRE(prop)
  end,
}

-- Every state gets a shadow state named 'ZWJ_<state>' which records that the
-- previous codepoint was a ZWJ. `to_ZWJ` maps a state to its shadow and
-- `from_ZWJ` maps the shadow back to the underlying state.
local from_ZWJ, to_ZWJ = {}, {}
for name in pairs(state_map) do
  local shadow = 'ZWJ_' .. name
  to_ZWJ[name] = shadow
  from_ZWJ[shadow] = name
end


-- Feed a single codepoint into the word break state machine.
--
-- cp:    the codepoint to classify.
-- state: the state returned by the previous call. Its value is internal and
--        should not be relied upon: pass it back unchanged, or pass `nil`.
--        `nil` should only be passed when `cp` starts a new cluster.
--
-- Returns `new_word, state`. `new_word` is true when a word starts at `cp`
-- and false when it does not. When the state handler asks for lookahead,
-- `new_word` is nil and `state` is a function; such a state is not accepted
-- by this function but has to be called with the following codepoints
-- directly, as the iterator `word_boundaries_start` does.
local function read_codepoint(cp, state)
  -- A 'ZWJ_*' state means the previous codepoint was a ZWJ; the handler of
  -- the underlying state is used in that case.
  local after_zwj = from_ZWJ[state]
  local prop = property[cp]
  local handler = state_map[after_zwj or state or 'START']
  local next_state, new_word = handler(prop)
  -- Never break between a ZWJ and an Extended_Pictographic codepoint.
  if after_zwj and extended_pictographic[cp] then
    new_word = false
  end
  -- Remember that we just saw a ZWJ by switching to the shadow state.
  if prop == 'ZWJ' then
    next_state = to_ZWJ[next_state]
  end
  return new_word, next_state
end

-- A Lua iterator for strings which only reports the beginning of every word
-- segment. Each call of the returned function yields the byte position and
-- the codepoint of the next word start, or nothing at the end of the string.
local function word_boundaries_start(str)
  local step, subject, pos = utf8.codes(str)
  local state = "START"
  -- Position and codepoint whose status is still undecided while looking
  -- ahead; both are nil when no lookahead is in progress.
  local pending_pos, pending_code
  return function()
    while true do
      local is_start, code
      pos, code = step(subject, pos)
      if pending_pos then
        -- `state` is a lookahead function here. It is also called with nil
        -- at the end of the string to resolve the pending codepoint.
        is_start, state = state(code)
        if is_start ~= nil then
          -- Decided: rewind to the pending codepoint so that everything
          -- after it is read again by the regular state machine.
          pos, code = pending_pos, pending_code
          pending_pos, pending_code = nil, nil
        end
      elseif not pos then
        return
      else
        is_start, state = read_codepoint(code, state)
        if is_start == nil then
          pending_pos, pending_code = pos, code
        end
      end
      if is_start then
        return pos, code
      end
    end
  end
end
-- A more useful iterator for use in a generic `for`: every step returns the
-- byte range of the segment in reverse order (last byte, first byte) followed
-- by a string with the word.
-- An empty string yields no segments.
local function word_boundaries(str)
  local iter = word_boundaries_start(str)
  local first = iter()
  if not first then
    -- Nothing to iterate over (empty string): without this guard the
    -- computation of the initial control value would fail on nil.
    return function() end
  end
  return function(_, cur)
    -- `cur` is the last byte of the previous segment.
    if cur == #str then return end
    local new = iter()
    -- No further word start: the last segment extends to the end.
    if not new then return #str, cur + 1, str:sub(cur + 1) end
    return new - 1, cur + 1, str:sub(cur + 1, new - 1)
  end, nil, first - 1
end
-- Public interface of the module.
return {
  read_codepoint = read_codepoint,
  -- The value must name the local iterator defined in this file; a
  -- misspelled name would resolve to an undefined global and export nil.
  word_boundaries_start = word_boundaries_start,
  word_boundaries = word_boundaries,
}