diff --git a/cabal.project b/cabal.project index f3560ee..4087242 100644 --- a/cabal.project +++ b/cabal.project @@ -1 +1,7 @@ packages: ./country, ./code-generation + +-- https://github.com/byteverse/bytebuild/pull/29 +source-repository-package + type: git + location: https://github.com/parsonsmatt/bytebuild + tag: f65821b505233a98e74786d7c7ec66aabae50a61 diff --git a/country/country.cabal b/country/country.cabal index 04141a4..14b72b7 100644 --- a/country/country.cabal +++ b/country/country.cabal @@ -75,7 +75,7 @@ library , primitive >= 0.6.4 && <0.8 , primitive-unlifted >= 0.1.3 && <1.0 , scientific >=0.3 && <0.4 - , text >= 1.2 && <1.3 + , text >= 1.2 && <2.2 , text-short >=0.1.3 , unordered-containers >=0.2 && <0.3 default-language: Haskell2010 diff --git a/country/src/Continent.hs b/country/src/Continent.hs index 3792dfe..5d5ae3a 100644 --- a/country/src/Continent.hs +++ b/country/src/Continent.hs @@ -26,7 +26,7 @@ import Continent.Unsafe import Control.Monad (forM_) import Control.Monad.ST (runST) import Country.Unexposed.Continents (continentAList) -import Country.Unexposed.Util (mapTextArray,charToWord16,word16ToInt,timesTwo) +import Country.Unexposed.Util (mapTextArray,word16ToInt,timesTwo,charToTextWord) import Country.Unsafe (Country(Country)) import Data.Char (toLower) import Data.Text (Text) @@ -51,8 +51,8 @@ allAlphaUpper = TA.run $ do m <- TA.new (timesTwo numberOfContinents) forM_ continentNameDb $ \(n,_,(a1,a2)) -> do let ix = timesTwo (fromIntegral n) - TA.unsafeWrite m ix (charToWord16 a1) - TA.unsafeWrite m (ix + 1) (charToWord16 a2) + TA.unsafeWrite m ix (charToTextWord a1) + TA.unsafeWrite m (ix + 1) (charToTextWord a2) return m {-# NOINLINE allAlphaUpper #-} diff --git a/country/src/Country.hs b/country/src/Country.hs index 33c6469..0d88c7d 100644 --- a/country/src/Country.hs +++ b/country/src/Country.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE CPP #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE DataKinds #-} {-# LANGUAGE MagicHash #-} @@ -40,7 +41,7 @@ import Country.Unexposed.Names (hashMapUtf16,hashMapUtf8) import Country.Unexposed.Names (numberOfPossibleCodes,alphaTwoHashMap,alphaThreeHashMap,decodeMap,decodeMapUtf8,decodeNumeric,encodeEnglish,encodeEnglishShort) import Country.Unexposed.Trie (Trie,trieFromList,trieParser) import Country.Unexposed.TrieByte (TrieByte,trieByteFromList,trieByteParser) -import Country.Unexposed.Util (mapTextArray,charToWord16,word16ToInt,timesTwo,timesThree) +import Country.Unexposed.Util (mapTextArray,charToTextWord,word16ToInt,timesTwo,timesThree) import Country.Unsafe (Country(..)) import Data.Bytes.Types (Bytes(Bytes)) import Data.ByteString (ByteString) @@ -114,10 +115,15 @@ decodeAlphaThree = flip HM.lookup alphaThreeHashMap -- countries. It strives to handle any source language. Open an -- issue on the issue tracker if there are names that are missing. decode :: Text -> Maybe Country +#if MIN_VERSION_text(2,0,0) +decode txt = + decodeUtf8 (TE.encodeUtf8 txt) +#else decode (TI.Text (TA.Array arr) off16 len16) = case (BytesHashMap.lookup (Bytes (ByteArray arr) (off16 * 2) (len16 * 2)) hashMapUtf16) of Nothing -> Nothing Just w -> Just (Country (fromIntegral w)) +#endif -- | Decode a 'Country' from a UTF-8-encoded 'ByteString'. decodeUtf8 :: ByteString -> Maybe Country @@ -164,8 +170,8 @@ allAlphaTwoUpper = TA.run $ do m <- TA.new (timesTwo numberOfCountries) forM_ countryNameQuads $ \(n,_,(a1,a2),_) -> do let ix = timesTwo (indexOfCountry (Country n)) - TA.unsafeWrite m ix (charToWord16 a1) - TA.unsafeWrite m (ix + 1) (charToWord16 a2) + TA.unsafeWrite m ix (charToTextWord a1) + TA.unsafeWrite m (ix + 1) (charToTextWord a2) return m {-# NOINLINE allAlphaTwoUpper #-} @@ -174,9 +180,9 @@ allAlphaThreeUpper = TA.run $ do m <- TA.new (timesThree numberOfCountries) forM_ countryNameQuads $ \(n,_,_,(a1,a2,a3)) -> do let ix = timesThree (indexOfCountry (Country n)) - TA.unsafeWrite m ix (charToWord16 a1) - TA.unsafeWrite m (ix + 1) (charToWord16 a2) - TA.unsafeWrite m (ix + 2) (charToWord16 a3) + TA.unsafeWrite m ix (charToTextWord a1) + TA.unsafeWrite m (ix + 1) (charToTextWord a2) + TA.unsafeWrite m (ix + 2) (charToTextWord a3) return m {-# NOINLINE allAlphaThreeUpper #-} diff --git a/country/src/Country/Unexposed/Names.hs b/country/src/Country/Unexposed/Names.hs index e2d57e8..d49f1a5 100644 --- a/country/src/Country/Unexposed/Names.hs +++ b/country/src/Country/Unexposed/Names.hs @@ -1,3 +1,4 @@ +{-# language CPP #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE MagicHash #-} @@ -140,7 +141,11 @@ hashMapUtf8 = BytesHashMap.fromTrustedList hashMapUtf16 :: BytesHashMap.Map hashMapUtf16 = BytesHashMap.fromTrustedList ( map +#if MIN_VERSION_text(2,0,0) + (\(a,Text.Text (Text.ByteArray arr) off16 len16) -> +#else (\(a,Text.Text (Text.Array arr) off16 len16) -> +#endif (Bytes (ByteArray arr) (off16 * 2) (len16 * 2),fromIntegral a) ) countryPairs ) diff --git a/country/src/Country/Unexposed/Trie.hs b/country/src/Country/Unexposed/Trie.hs index 216d187..92f56bc 100644 --- a/country/src/Country/Unexposed/Trie.hs +++ b/country/src/Country/Unexposed/Trie.hs @@ -14,8 +14,8 @@ import qualified Data.HashMap.Strict as HM import qualified Data.Attoparsec.Text as AT import qualified Data.Semigroup as SG --- | If the value is not the max Word16 (65535), there --- is a match. This means that 65535 cannot be used, which +-- | If the value is not the max Word16 (65535), there +-- is a match. This means that 65535 cannot be used, which -- is fine for this since 65535 is not used as a country code. data Trie = Trie { trieValue :: {-# UNPACK #-} !Word16 diff --git a/country/src/Country/Unexposed/Util.hs b/country/src/Country/Unexposed/Util.hs index db89ecf..62bcb9e 100644 --- a/country/src/Country/Unexposed/Util.hs +++ b/country/src/Country/Unexposed/Util.hs @@ -1,10 +1,16 @@ +{-# language RankNTypes #-} +{-# LANGUAGE CPP #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE MagicHash #-} module Country.Unexposed.Util ( mapTextArray + , textWordToChar + , charToTextWord , charToWord16 , word16ToChar + , charToWord8 + , word8ToChar , word16ToInt , timesTwo , timesThree @@ -13,27 +19,79 @@ module Country.Unexposed.Util import Data.Bits (unsafeShiftL,unsafeShiftR) import Data.Char (chr,ord) -import Data.Word (Word16) +import Data.Word (Word16, Word8) import GHC.Exts (sizeofByteArray#) import GHC.Int (Int(I#)) +import Control.Monad.ST +import Data.Text.Unsafe +import Data.Text.Internal +import qualified Data.Text.Internal.Unsafe.Char as Unsafe.Char import qualified Data.Text.Array as TA mapTextArray :: (Char -> Char) -> TA.Array -> TA.Array +#if MIN_VERSION_text(2,0,0) +mapTextArray f src@(TA.ByteArray arr) = runST $ do + -- this implementation is lifted from the text internals + marr <- TA.new (l + 4) + outer marr (l + 4) o 0 + where + o = 0 + l = I# (sizeofByteArray# arr) + outer :: forall s. TA.MArray s -> Int -> Int -> Int -> ST s TA.Array + outer !dst !dstLen = inner + where + inner !srcOff !dstOff + | srcOff >= l + o = do + TA.shrinkM dst dstOff + arr <- TA.unsafeFreeze dst + return arr + | dstOff + 4 > dstLen = do + let !dstLen' = dstLen + (l + o) - srcOff + 4 + dst' <- TA.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff + | otherwise = do + let !(Iter c d) = iterArray src srcOff + d' <- Unsafe.Char.unsafeWrite dst dstOff (safe (f c)) + inner (srcOff + d) (dstOff + d') +#else mapTextArray f a@(TA.Array inner) = TA.run $ do let len = half (I# (sizeofByteArray# inner)) m <- TA.new len TA.copyI m 0 a 0 len let go !ix = if ix < len then do - TA.unsafeWrite m ix (charToWord16 (f (word16ToChar (TA.unsafeIndex a ix)))) + TA.unsafeWrite m ix (charToTextWord (f (textWordToChar (TA.unsafeIndex a ix)))) go (ix + 1) else return () go 0 return m +#endif {-# INLINE mapTextArray #-} +#if MIN_VERSION_text(2,0,0) +textWordToChar :: Word8 -> Char +textWordToChar = word8ToChar + +charToTextWord :: Char -> Word8 +charToTextWord = charToWord8 +#else +textWordToChar :: Word16 -> Char +textWordToChar = word16ToChar + +charToTextWord :: Char -> Word16 +charToTextWord = charToWord16 +#endif + +word8ToChar :: Word8 -> Char +word8ToChar = chr . fromIntegral +{-# INLINE word8ToChar #-} + +charToWord8 :: Char -> Word8 +charToWord8 = fromIntegral . ord +{-# INLINE charToWord8 #-} + word16ToChar :: Word16 -> Char word16ToChar = chr . fromIntegral {-# INLINE word16ToChar #-}