{-# LANGUAGE ViewPatterns #-}
-- TODO : learn how to use Functional Morphology instead
-- |Simple default rules for English morphology
module NLP.Minimorph.English where

-- Only needed for older GHC, but let's avoid CPP for this instance.
import Data.Monoid ((<>))

import           Data.Char (isSpace, isUpper, toLower)
import           Data.Text (Text)
import qualified Data.Text as T

import NLP.Minimorph.Util

-- ---------------------------------------------------------------------
-- ** Punctuation
-- ---------------------------------------------------------------------

-- | No Oxford commas, alas.
--
-- > commas "and" "foo bar"       == "foo and bar"
-- > commas "and" "foo, bar, baz" == "foo, bar and baz"
commas :: Text -> [Text] -> Text
commas :: Text -> [Text] -> Text
commas _ []  = ""
commas _ [x :: Text
x] = Text
x
commas et :: Text
et xs :: [Text]
xs = Text -> [Text] -> Text
T.intercalate ", " ([Text] -> [Text]
forall a. [a] -> [a]
init [Text]
xs) Text -> Text -> Text
<+> Text
et Text -> Text -> Text
<+> [Text] -> Text
forall a. [a] -> a
last [Text]
xs

-- ---------------------------------------------------------------------
-- ** Numbers
-- ---------------------------------------------------------------------

-- | > cardinal 0 == "zero"
--   > cardinal 1 == "one"
--   > cardinal 2 == "two"
--   > cardinal 10 == "ten"
--   > cardinal 11 == "11"
cardinal :: Int -> Text
cardinal :: Int -> Text
cardinal n :: Int
n = case Int
n of
    0  -> "zero"
    1  -> "one"
    2  -> "two"
    3  -> "three"
    4  -> "four"
    5  -> "five"
    6  -> "six"
    7  -> "seven"
    8  -> "eight"
    9  -> "nine"
    10 -> "ten"
    _ -> Int -> Text
forall a. Show a => a -> Text
tshow Int
n

-- | > ordinalNotSpelled 1 == "1st"
--   > ordinalNotSpelled 2 == "2nd"
--   > ordinalNotSpelled 11 == "11th"
ordinalNotSpelled :: Int -> Text
ordinalNotSpelled :: Int -> Text
ordinalNotSpelled k :: Int
k = case Int -> Int
forall a. Num a => a -> a
abs Int
k Int -> Int -> Int
forall a. Integral a => a -> a -> a
`rem` 100 of
  n :: Int
n | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> 3 Bool -> Bool -> Bool
&& Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< 21 -> Int
k Int -> Text -> Text
forall a. Show a => a -> Text -> Text
`suf` "th"
    | Int
n Int -> Int -> Int
forall a. Integral a => a -> a -> a
`rem` 10 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 1 -> Int
k Int -> Text -> Text
forall a. Show a => a -> Text -> Text
`suf` "st"
    | Int
n Int -> Int -> Int
forall a. Integral a => a -> a -> a
`rem` 10 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 2 -> Int
k Int -> Text -> Text
forall a. Show a => a -> Text -> Text
`suf` "nd"
    | Int
n Int -> Int -> Int
forall a. Integral a => a -> a -> a
`rem` 10 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 3 -> Int
k Int -> Text -> Text
forall a. Show a => a -> Text -> Text
`suf` "rd"
    | Bool
otherwise       -> Int
k Int -> Text -> Text
forall a. Show a => a -> Text -> Text
`suf` "th"
 where
  num :: a
num suf :: a -> Text -> Text
`suf` s :: Text
s = a -> Text
forall a. Show a => a -> Text
tshow a
num Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
s

-- | > ordinal 1 == "first"
--   > ordinal 2 == "second"
--   > ordinal 3 == "third"
--   > ordinal 11 == "11th"
--   > ordinal 42 == "42nd"
ordinal :: Int -> Text
ordinal :: Int -> Text
ordinal n :: Int
n = case Int
n of
    0  -> "zeroth"
    1  -> "first"
    2  -> "second"
    3  -> "third"
    4  -> "fourth"
    5  -> "fifth"
    6  -> "sixth"
    7  -> "seventh"
    8  -> "eighth"
    9  -> "ninth"
    10 -> "tenth"
    k :: Int
k  -> Int -> Text
ordinalNotSpelled Int
k

-- ---------------------------------------------------------------------
-- ** Nouns and verbs
-- ---------------------------------------------------------------------

-- | Heuristics for English plural for an unknown noun.
--
-- > defaultNounPlural "egg"    == "eggs"
-- > defaultNounPlural "patch"  == "patches"
-- > defaultNounPlural "boy"    == "boys"
-- > defaultNounPlural "spy"    == "spies"
-- > defaultNounPlural "thesis" == "theses"
--
-- http://www.paulnoll.com/Books/Clear-English/English-plurals-1.html
--
-- http://en.wikipedia.org/wiki/English_plural
defaultNounPlural :: Text -> Text
defaultNounPlural :: Text -> Text
defaultNounPlural x :: Text
x
    | "is" Text -> Text -> Bool
`T.isSuffixOf` Text
x = Text
thesis
    | Text -> Bool
hasSibilantSuffix Text
x   = Text
sibilant_o
    | Text -> Bool
hasCoSuffix Text
x         = Text
sibilant_o
    | Text -> Bool
hasCySuffix Text
x         = Text
y_final
    | "ff" Text -> Text -> Bool
`T.isSuffixOf` Text
x = Text
ff_final  -- quite often not the case
    | "f" Text -> Text -> Bool
`T.isSuffixOf` Text
x  = Text
f_final   -- but this one as well, so both needed
    | Bool
otherwise             = Text
plain
  where
    plain :: Text
plain      = Text
x            Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "s"
    sibilant_o :: Text
sibilant_o = Text
x            Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "es"
    y_final :: Text
y_final    = Text -> Text
T.init Text
x     Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "ies"
    f_final :: Text
f_final    = Text -> Text
T.init Text
x     Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "ves"
    ff_final :: Text
ff_final   = Int -> Text -> Text
T.dropEnd 2 Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "ves"
    thesis :: Text
thesis     = Int -> Text -> Text
T.dropEnd 2 Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "es"

-- | Heuristics for 3rd person singular and past participle
--   for an unknown regular verb. Doubling of final consonants
--   can be handled via a table of (partially) irregular verbs.
--
-- > defaultVerbStuff "walk"  == ("walks",  "walked")
-- > defaultVerbStuff "push"  == ("pushes", "pushed")
-- > defaultVerbStuff "play"  == ("plays",  "played")
-- > defaultVerbStuff "cry"   == ("cries",  "cried")
defaultVerbStuff :: Text -> (Text, Text)
defaultVerbStuff :: Text -> (Text, Text)
defaultVerbStuff x :: Text
x
    | Text -> Bool
hasSibilantSuffix Text
x   = (Text, Text)
sibilant_o
    | Text -> Bool
hasCoSuffix Text
x         = (Text, Text)
sibilant_o
    | Text -> Bool
hasCySuffix Text
x         = (Text, Text)
y_final
    | "e" Text -> Text -> Bool
`T.isSuffixOf` Text
x  = (Text, Text)
e_final
    | Bool
otherwise             = (Text, Text)
plain
  where
    plain :: (Text, Text)
plain      = (Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "s"         , Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "ed")
    sibilant_o :: (Text, Text)
sibilant_o = (Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "es"        , Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "ed")
    e_final :: (Text, Text)
e_final    = (Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "s"         , Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "d")
    y_final :: (Text, Text)
y_final    = (Text -> Text
T.init Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "ies", Text -> Text
T.init Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "ied")

-- | Heuristics for a possesive form for an unknown noun.
--
-- > defaultPossesive "pass"        == "pass'"
-- > defaultPossesive "SOS"         == "SOS'"
-- > defaultPossesive "Mr Blinkin'" == "Mr Blinkin's"
-- > defaultPossesive "cry"         == "cry's"
defaultPossesive :: Text -> Text
defaultPossesive :: Text -> Text
defaultPossesive t :: Text
t =
  case Text -> Char
T.last Text
t of
    's'  -> Text
t Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "'"
    'S'  -> Text
t Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "'"
    '\'' -> Text
t Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "s"
    _    -> Text
t Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> "'s"

-- ---------------------------------------------------------------------
-- ** Determiners
-- ---------------------------------------------------------------------

anNumerals :: [Text]
anNumerals :: [Text]
anNumerals = [ "11", "11th", "18", "18th" ]

-- | > indefiniteDet "dog"  == "a"
--   > indefiniteDet "egg"  == "an"
--   > indefiniteDet "ewe"  == "a"
--   > indefiniteDet "ewok" == "an"
--   > indefiniteDet "8th"  == "an"
indefiniteDet :: Text -> Text
indefiniteDet :: Text -> Text
indefiniteDet t :: Text
t = if Text -> Bool
wantsAn Text
t then "an" else "a"

-- | True if the indefinite determiner for a word would normally be
--   \'an\' as opposed to \'a\'.
wantsAn :: Text -> Bool
wantsAn :: Text -> Bool
wantsAn t_ :: Text
t_ =
    if Text -> Bool
startsWithAcronym Text
t_
       then Text -> Bool
acronymWantsAn Text
t_
       else Bool
useAn0 Bool -> Bool -> Bool
|| Bool
useAn1
  where
    t :: Text
t      = (Text, Text) -> Text
forall a b. (a, b) -> a
fst ((Text, Text) -> Text) -> (Text, Text) -> Text
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> Text -> (Text, Text)
T.break Char -> Bool
isSep (Text -> (Text, Text)) -> Text -> (Text, Text)
forall a b. (a -> b) -> a -> b
$ Text -> Text
T.toLower Text
t_
    useAn0 :: Bool
useAn0 = Text
t Text -> [Text] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
anNumerals
    useAn1 :: Bool
useAn1 = case Text -> Maybe (Char, Text)
T.uncons Text
t of
                Just (h :: Char
h, "") -> Char -> Bool
isLetterWithInitialVowelSound Char
h
                Just ('8',_) -> Bool
True
                Just ('u',_) -> Text -> Bool
hasVowel_U_Prefix Text
t
                Just (h :: Char
h, _)  -> Char -> Bool
isVowel Char
h Bool -> Bool -> Bool
`butNot` Text -> Bool
hasSemivowelPrefix Text
t
                Nothing      -> Bool
False
    x :: Bool
x butNot :: Bool -> Bool -> Bool
`butNot` y :: Bool
y = Bool
x Bool -> Bool -> Bool
&& Bool -> Bool
not Bool
y
    isSep :: Char -> Bool
isSep c :: Char
c = Char -> Bool
isSpace Char
c Bool -> Bool -> Bool
|| Char
c Char -> [Char] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` ("-" :: String)

-- | Variant of 'wantsAn' that assumes the input string is pronounced
--   one letter at a time.
--
--   > wantsAn        "x-ray" == False
--   > acronymWantsAn "x-ray" == True
--
--   Note that this won't do the right thing for words like \"SCUBA\".
--   You really have to reserve it for those separate-letter acronyms.
acronymWantsAn :: Text -> Bool
acronymWantsAn :: Text -> Bool
acronymWantsAn (Text -> Text
T.toLower -> Text
t) =
    Bool
useAn0 Bool -> Bool -> Bool
|| Bool
useAn1
  where
    useAn0 :: Bool
useAn0 = Text
t Text -> [Text] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Text]
anNumerals
    useAn1 :: Bool
useAn1 = case Text -> Maybe (Char, Text)
T.uncons Text
t of
                Just ('8',_) -> Bool
True
                Just (h :: Char
h,_)   -> Char -> Bool
isLetterWithInitialVowelSound Char
h
                Nothing      -> Bool
False

-- ---------------------------------------------------------------------
-- ** Acronyms
-- ---------------------------------------------------------------------

-- | True if all upper case from second letter and up.
--
--   > looksLikeAcronym "DNA"  == True
--   > looksLikeAcronym "tRNA" == True
--   > looksLikeAcronym "x"    == False
--   > looksLikeAcronym "DnA"  == False
looksLikeAcronym :: Text -> Bool
looksLikeAcronym :: Text -> Bool
looksLikeAcronym "" = Bool
False
looksLikeAcronym x :: Text
x = (Char -> Bool) -> Text -> Bool
T.all Char -> Bool
isUpper (if Text -> Int
T.length Text
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> 1 then Int -> Text -> Text
T.drop 1 Text
x else Text
x)

-- | True if the first word (separating on either hyphen or space)
--   looks like an acronym.
startsWithAcronym :: Text -> Bool
startsWithAcronym :: Text -> Bool
startsWithAcronym =
    Text -> Bool
looksLikeAcronym (Text -> Bool) -> (Text -> Text) -> Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
firstWord
  where
    firstWord :: Text -> Text
firstWord = (Text, Text) -> Text
forall a b. (a, b) -> a
fst ((Text, Text) -> Text) -> (Text -> (Text, Text)) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Text -> (Text, Text)
T.break Char -> Bool
isSep
    isSep :: Char -> Bool
isSep c :: Char
c   = Char -> Bool
isSpace Char
c Bool -> Bool -> Bool
|| Char
c Char -> [Char] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` ("-" :: String)

-- ---------------------------------------------------------------------
-- ** Sounds
-- ---------------------------------------------------------------------

-- | Ends with a \'sh\' sound.
hasSibilantSuffix :: Text -> Bool
hasSibilantSuffix :: Text -> Bool
hasSibilantSuffix x :: Text
x = (Text -> Bool) -> [Text] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Text -> Text -> Bool
`T.isSuffixOf` Text
x) ["x","s","ch","sh","z","j"]

-- | Starts with a semivowel.
hasSemivowelPrefix :: Text -> Bool
hasSemivowelPrefix :: Text -> Bool
hasSemivowelPrefix ls :: Text
ls = (Text -> Bool) -> [Text] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Text -> Text -> Bool
`T.isPrefixOf` Text
ls) ["y","w","eu","ewe"]

-- | Starts with a vowel-y \'U\' sound
hasVowel_U_Prefix :: Text -> Bool
hasVowel_U_Prefix :: Text -> Bool
hasVowel_U_Prefix t :: Text
t =
    case Text -> [Char]
T.unpack Text
t of
        ['u']       -> Bool
False
        ['u',_]     -> Bool
True
        ('u':c :: Char
c:v :: Char
v:_) -> Bool -> Bool
not (Char -> Bool
isConsonant Char
c Bool -> Bool -> Bool
&& Char -> Bool
isVowel Char
v)
        _           -> Bool
False

-- | Last two letters are a consonant and \'y\'.
hasCySuffix :: Text -> Bool
hasCySuffix :: Text -> Bool
hasCySuffix (Text -> [Char]
T.unpack (Text -> [Char]) -> (Text -> Text) -> Text -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Text -> Text
T.takeEnd 2 -> [x :: Char
x, 'y']) = Char -> Bool
isConsonant Char
x
hasCySuffix _ = Bool
False

-- | Last two letters are a consonant and \'o\'.
hasCoSuffix :: Text -> Bool
hasCoSuffix :: Text -> Bool
hasCoSuffix (Text -> [Char]
T.unpack (Text -> [Char]) -> (Text -> Text) -> Text -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Text -> Text
T.takeEnd 2 -> [x :: Char
x, 'o']) = Char -> Bool
isConsonant Char
x
hasCoSuffix _ = Bool
False

-- | Is a vowel.
isVowel :: Char -> Bool
isVowel :: Char -> Bool
isVowel = (Char -> [Char] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` ("aeiou" :: String)) (Char -> Bool) -> (Char -> Char) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
toLower

-- | Letters that when pronounced independently in English sound like they
--   begin with vowels.
--
--   > isLetterWithInitialVowelSound 'r' == True
--   > isLetterWithInitialVowelSound 'k' == False
--
--   (In the above, \'r\' is pronounced \"are\", but \'k\' is pronounced
--   \"kay\".)
isLetterWithInitialVowelSound :: Char -> Bool
isLetterWithInitialVowelSound :: Char -> Bool
isLetterWithInitialVowelSound = (Char -> [Char] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` ("aeiofhlmnrsx" :: String)) (Char -> Bool) -> (Char -> Char) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
toLower

-- | Is a consonant.
isConsonant :: Char -> Bool
isConsonant :: Char -> Bool
isConsonant = Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isVowel