diff --git a/.gitignore b/.gitignore index 6b8ee81..d25e37f 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,5 @@ tags TAGS docs/db/unthreat + +geolite-csv/data/large diff --git a/colonnade/src/Colonnade/Decoding.hs b/colonnade/src/Colonnade/Decoding.hs index 1cc2ef7..07a7748 100644 --- a/colonnade/src/Colonnade/Decoding.hs +++ b/colonnade/src/Colonnade/Decoding.hs @@ -8,6 +8,7 @@ import Colonnade.Types import Data.Functor.Contravariant import Data.Vector (Vector) import qualified Data.Vector as Vector +import Data.Char (chr) -- | Converts the content type of a 'Decoding'. The @'Contravariant' f@ -- constraint means that @f@ can be 'Headless' but not 'Headed'. @@ -101,3 +102,59 @@ headedToIndexed v = getEitherWrap . go <$> EitherWrap rcurrent <*> rnext +-- | This adds one to the index because text editors consider +-- line number to be one-based, not zero-based. +prettyError :: (c -> String) -> DecodingRowError f c -> String +prettyError toStr (DecodingRowError ix e) = unlines + $ ("Decoding error on line " ++ show (ix + 1) ++ " of file.") + : ("Error Category: " ++ descr) + : map (" " ++) errDescrs + where (descr,errDescrs) = prettyRowError toStr e + +prettyRowError :: (content -> String) -> RowError f content -> (String, [String]) +prettyRowError toStr x = case x of + RowErrorParse err -> (,) "CSV Parsing" + [ "The line could not be parsed into cells correctly." + , "Original parser error: " ++ err + ] + RowErrorSize reqLen actualLen -> (,) "Row Length" + [ "Expected the row to have exactly " ++ show reqLen ++ " cells." + , "The row only has " ++ show actualLen ++ " cells." + ] + RowErrorMinSize reqLen actualLen -> (,) "Row Min Length" + [ "Expected the row to have at least " ++ show reqLen ++ " cells." + , "The row only has " ++ show actualLen ++ " cells." + ] + RowErrorMalformed enc -> (,) "Text Decoding" + [ "Tried to decode the input as " ++ enc ++ " text" + , "There is a mistake in the encoding of the text." + ] + RowErrorHeading errs -> (,) "Header" (prettyHeadingErrors toStr errs) + RowErrorDecode errs -> (,) "Cell Decoding" (prettyCellErrors toStr errs) + +prettyCellErrors :: (c -> String) -> DecodingCellErrors f c -> [String] +prettyCellErrors toStr (DecodingCellErrors errs) = drop 1 $ + flip concatMap errs $ \(DecodingCellError content (Indexed ix _) msg) -> + let str = toStr content in + [ "-----------" + , "Column " ++ columnNumToLetters ix + , "Original parse error: " ++ msg + , "Cell Content Length: " ++ show (Prelude.length str) + , "Cell Content: " ++ if null str + then "[empty cell]" + else str + ] + +prettyHeadingErrors :: (c -> String) -> HeadingErrors c -> [String] +prettyHeadingErrors conv (HeadingErrors missing duplicates) = concat + [ concatMap (\h -> ["The header " ++ conv h ++ " was missing."]) missing + , concatMap (\(h,n) -> ["The header " ++ conv h ++ " occurred " ++ show n ++ " times."]) duplicates + ] + +columnNumToLetters :: Int -> String +columnNumToLetters i + | i >= 0 && i < 25 = [chr (i + 65)] + | otherwise = "Beyond Z. Fix this." + + + diff --git a/colonnade/src/Colonnade/Types.hs b/colonnade/src/Colonnade/Types.hs index f9edcd8..15bb0b6 100644 --- a/colonnade/src/Colonnade/Types.hs +++ b/colonnade/src/Colonnade/Types.hs @@ -1,6 +1,7 @@ -{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE DeriveFoldable #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} -{-# LANGUAGE GADTs #-} +{-# LANGUAGE GADTs #-} module Colonnade.Types ( Encoding(..) , Decoding(..) @@ -24,11 +25,11 @@ import qualified Data.Vector as Vector -- | Isomorphic to 'Identity' newtype Headed a = Headed { getHeaded :: a } - deriving (Eq,Ord,Functor,Show,Read) + deriving (Eq,Ord,Functor,Show,Read,Foldable) -- | Isomorphic to 'Proxy' data Headless a = Headless - deriving (Eq,Ord,Functor,Show,Read) + deriving (Eq,Ord,Functor,Show,Read,Foldable) data Indexed f a = Indexed { indexedIndex :: !Int @@ -76,6 +77,7 @@ data RowError f content | RowErrorSize !Int !Int -- ^ Wrong number of cells in the row | RowErrorHeading !(HeadingErrors content) | RowErrorMinSize !Int !Int + | RowErrorMalformed !String -- ^ Error decoding unicode content deriving (Show,Read,Eq) -- instance (Show (f content), Typeable content) => Exception (DecodingErrors f content) diff --git a/geolite-csv/data/GeoLite2-City-Blocks-IPv4.small.csv b/geolite-csv/data/small/GeoLite2-City-Blocks-IPv4.csv similarity index 100% rename from geolite-csv/data/GeoLite2-City-Blocks-IPv4.small.csv rename to geolite-csv/data/small/GeoLite2-City-Blocks-IPv4.csv diff --git a/geolite-csv/data/GeoLite2-City-Locations-en.small.csv b/geolite-csv/data/small/GeoLite2-City-Locations-en.csv similarity index 100% rename from geolite-csv/data/GeoLite2-City-Locations-en.small.csv rename to geolite-csv/data/small/GeoLite2-City-Locations-en.csv diff --git a/geolite-csv/data/GeoLite2-City-Locations-ja.small.csv b/geolite-csv/data/small/GeoLite2-City-Locations-ja.csv similarity index 100% rename from geolite-csv/data/GeoLite2-City-Locations-ja.small.csv rename to geolite-csv/data/small/GeoLite2-City-Locations-ja.csv diff --git a/geolite-csv/geolite-csv.cabal b/geolite-csv/geolite-csv.cabal index 7561a30..c93b3a5 100644 --- a/geolite-csv/geolite-csv.cabal +++ b/geolite-csv/geolite-csv.cabal @@ -43,6 +43,7 @@ test-suite geolite-csv-test , test-framework-hunit , pipes-bytestring , pipes-text + , directory ghc-options: -threaded -rtsopts -with-rtsopts=-N default-language: Haskell2010 diff --git a/geolite-csv/scripts/load-full-databases b/geolite-csv/scripts/load-full-databases new file mode 100755 index 0000000..f78ea92 --- /dev/null +++ b/geolite-csv/scripts/load-full-databases @@ -0,0 +1,35 @@ +#!/bin/bash + +set -e + +current_dir="${PWD##*/}" + +echo "Current directory is: $current_dir" + +if [ "$current_dir" = "colonnade" ] +then + cd ./geolite-csv +fi + +new_current_dir="${PWD##*/}" +if [ "$new_current_dir" != "geolite-csv" ] +then + echo "Not currently in the geolite project directory. Exiting." + exit 1 +fi + +mkdir -p ./data/large +cd ./data/large + +rm -f *.zip +rm -rf GeoLite2-* + +curl 'http://geolite.maxmind.com/download/geoip/database/GeoLite2-City-CSV.zip' > archive.zip +unzip archive.zip -d ./ + +cd GeoLite2-City-CSV* +mv *.csv ../ +cd ../ +rm -rf GeoLite2-City-CSV* +rm archive.zip + diff --git a/geolite-csv/src/Geolite/Csv.hs b/geolite-csv/src/Geolite/Csv.hs index 2e49282..2384c8a 100644 --- a/geolite-csv/src/Geolite/Csv.hs +++ b/geolite-csv/src/Geolite/Csv.hs @@ -39,16 +39,20 @@ decodingCity = City decodingBlock :: Decoding Headed Text Block decodingBlock = Block <$> CD.headed "network" IPv4RangeText.decodeEither - <*> CD.headed "geoname_id" (CDT.map GeonameId CDT.int) + <*> CD.headed "geoname_id" + (CDT.optional $ CDT.map GeonameId CDT.int) <*> CD.headed "registered_country_geoname_id" - (CDT.map GeonameId CDT.int) + (CDT.optional $ CDT.map GeonameId CDT.int) <*> CD.headed "represented_country_geoname_id" (CDT.optional $ CDT.map GeonameId CDT.int) <*> CD.headed "is_anonymous_proxy" (CDT.trueFalse "1" "0") <*> CD.headed "is_satellite_provider" (CDT.trueFalse "1" "0") <*> CD.headed "postal_code" CDT.text - <*> CD.headed "latitude" (CDT.fromReader TextRead.rational) - <*> CD.headed "longitude" (CDT.fromReader TextRead.rational) - <*> CD.headed "accuracy_radius" CDT.int + <*> CD.headed "latitude" + (CDT.optional $ CDT.fromReader TextRead.rational) + <*> CD.headed "longitude" + (CDT.optional $ CDT.fromReader TextRead.rational) + <*> CD.headed "accuracy_radius" + (CDT.optional CDT.int) diff --git a/geolite-csv/src/Geolite/Types.hs b/geolite-csv/src/Geolite/Types.hs index 94a0fab..a132627 100644 --- a/geolite-csv/src/Geolite/Types.hs +++ b/geolite-csv/src/Geolite/Types.hs @@ -29,14 +29,14 @@ data City = City data Block = Block { blockNetwork :: IPv4Range - , blockGeonameId :: GeonameId - , blockRegisteredCountryGeonameId :: GeonameId + , blockGeonameId :: Maybe GeonameId + , blockRegisteredCountryGeonameId :: Maybe GeonameId , blockRepresentedCountryGeonameId :: Maybe GeonameId , blockIsAnonymousProxy :: Bool , blockIsSatelliteProvider :: Bool , blockPostalCode :: Text - , blockLatitude :: Fixed E4 - , blockLongitude :: Fixed E4 - , blockAccuracyRadius :: Int + , blockLatitude :: Maybe (Fixed E4) + , blockLongitude :: Maybe (Fixed E4) + , blockAccuracyRadius :: Maybe Int } deriving (Show,Read,Eq,Ord) diff --git a/geolite-csv/test/Spec.hs b/geolite-csv/test/Spec.hs index ef86751..69436d0 100644 --- a/geolite-csv/test/Spec.hs +++ b/geolite-csv/test/Spec.hs @@ -2,33 +2,67 @@ module Main (main) where -import Test.HUnit (Assertion,(@?=)) -import Test.Framework (defaultMain, testGroup, Test) +import Test.HUnit (Assertion,(@?=),assertBool,assertFailure) +import Test.Framework (defaultMainWithOpts, interpretArgsOrExit, + testGroup, Test) import Test.Framework.Providers.HUnit (testCase) +import Test.Framework.Runners.TestPattern (parseTestPattern) +import Test.Framework.Runners.Options (RunnerOptions'(..)) import Geolite.Csv (cities,blocks) import Data.Text (Text) import Colonnade.Types import Siphon.Types import Data.Functor.Identity +import Control.Monad (unless) +import System.Environment (getArgs) +import System.Directory (doesDirectoryExist) import System.IO (withFile,IOMode(ReadMode)) import qualified Data.Text as Text import qualified Pipes.Prelude as Pipes import qualified Pipes.ByteString as PB import qualified Pipes.Text.Encoding as PT import qualified Siphon.Decoding as SD +import qualified Colonnade.Decoding as Decoding import Pipes +------------------------------------------------ +-- The default behavior of this test suite is to +-- test the CSV decoders against small samples of +-- the GeoLite2 databases. These small samples are +-- included as part of this repository. If you give +-- this test suite an argument named "large", it +-- will run against the full CSVs, which are around +-- 350MB. These are not included +-- as part of the repository, so they need to be +-- downloaded. The script found in +-- scripts/load-full-databases will download the full +-- archive, decompress it, and move the files to +-- the appropriate directory for this test suite +-- to run on them. +----------------------------------------------- + main :: IO () -main = defaultMain tests +main = do + xs <- getArgs + ropts' <- interpretArgsOrExit xs + let ropts = ropts' + { ropt_test_patterns = case ropt_test_patterns ropts' of + Nothing -> Just [parseTestPattern "small"] + Just xs -> Just xs + } + defaultMainWithOpts tests ropts tests :: [Test] -tests = - [ testGroup "Geolite CSV Decoding" +tests = flip concatMap ["small","large"] $ \size -> + [ testGroup size [ testCase "Network Blocks" $ streamFileWith - "data/GeoLite2-City-Blocks-IPv4.small.csv" + ("data/" ++ size ++ "/GeoLite2-City-Blocks-IPv4.csv") blocks , testCase "English City Locations" $ streamFileWith - "data/GeoLite2-City-Locations-en.small.csv" + ("data/" ++ size ++ "/GeoLite2-City-Locations-en.csv") + cities + , testCase "Japanese City Locations" $ streamFileWith + ("data/" ++ size ++ "/GeoLite2-City-Locations-ja.csv") cities ] ] @@ -39,8 +73,19 @@ streamFileWith :: -> Assertion streamFileWith filename decodingPipe = do r <- withFile filename ReadMode $ \h -> runEffect $ - fmap SD.csvResultFromEither (PT.decode (PT.utf8 . PT.eof) $ PB.fromHandle h) - >-> fmap SD.csvResultFromDecodingRowError decodingPipe + fmap (SD.convertDecodeError "utf-8") (PT.decode (PT.utf8 . PT.eof) $ PB.fromHandle h) + >-> fmap Just decodingPipe >-> Pipes.drain - r @?= CsvResultSuccess + case r of + Nothing -> assertBool "impossible" True + Just err -> assertFailure (Decoding.prettyError Text.unpack err) +-- let dirPiece = case xs of +-- ["full"] -> "large/" +-- _ -> "small/" +-- fullDirName = "data/" ++ dirPiece +-- errMsg = concat +-- [ "The " +-- , fullDirName +-- , " directory does not exist in the geolite project" +-- ] diff --git a/siphon/src/Siphon/Decoding.hs b/siphon/src/Siphon/Decoding.hs index 188d8ce..5f3a554 100644 --- a/siphon/src/Siphon/Decoding.hs +++ b/siphon/src/Siphon/Decoding.hs @@ -27,12 +27,12 @@ mkParseError i ctxs msg = id , "]" ] -csvResultFromEither :: Either (Producer ByteString m ()) () -> CsvResult f c -csvResultFromEither (Left _) = CsvResultTextDecodeError -csvResultFromEither (Right ()) = CsvResultSuccess - -csvResultFromDecodingRowError :: DecodingRowError f c -> CsvResult f c -csvResultFromDecodingRowError = CsvResultDecodeError +-- | This is a convenience function for working with @pipes-text@. +-- It will convert a UTF-8 decoding error into a `DecodingRowError`, +-- so the pipes can be properly chained together. +convertDecodeError :: String -> Either (Producer ByteString m ()) () -> Maybe (DecodingRowError f c) +convertDecodeError encodingName (Left _) = Just (DecodingRowError 0 (RowErrorMalformed encodingName)) +convertDecodeError _ (Right ()) = Nothing -- | This is seldom useful but is included for completeness. headlessPipe :: Monad m @@ -145,7 +145,8 @@ pipeGeneral initIx (Siphon _ _ parse isNull) wrapParseError decodeRow mleftovers Left err -> return err Right r -> do yield r - if isNull c1 then go1 ix else go2 ix c1 + let ixNext = ix + 1 + if isNull c1 then go1 ixNext else go2 ixNext c1 Atto.Partial k -> go3 ix k awaitSkip :: Monad m diff --git a/siphon/src/Siphon/Types.hs b/siphon/src/Siphon/Types.hs index 8e5cfe8..01184a0 100644 --- a/siphon/src/Siphon/Types.hs +++ b/siphon/src/Siphon/Types.hs @@ -13,12 +13,12 @@ data Siphon c = Siphon , siphonNull :: c -> Bool } --- | This type is provided for convenience with @pipes-text@ -data CsvResult f c - = CsvResultSuccess - | CsvResultTextDecodeError - | CsvResultDecodeError (DecodingRowError f c) - deriving (Show,Read,Eq) +-- -- | This type is provided for convenience with @pipes-text@ +-- data CsvResult f c +-- = CsvResultSuccess +-- | CsvResultTextDecodeError +-- | CsvResultDecodeError (DecodingRowError f c) +-- deriving (Show,Read,Eq) -- | Consider changing out the use of 'Vector' here