Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/Poseidon/CLI/Forge.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ module Poseidon.CLI.Forge where

import Poseidon.BibFile (BibEntry (..), BibTeX,
writeBibTeXFile)
import Poseidon.ColumnTypesJanno (PoseidonID (..))
import Poseidon.ColumnTypesUtils (ListColumn (..),
getMaybeListColumn)
import Poseidon.EntityTypes (EntityInput,
Expand Down Expand Up @@ -329,9 +330,9 @@ sumNonMissingSNPs accumulator (_, geno) = do
filterSeqSourceRows :: JannoRows -> SeqSourceRows -> SeqSourceRows
filterSeqSourceRows (JannoRows jRows) (SeqSourceRows sRows) =
let desiredPoseidonIDs = map jPoseidonID jRows
in SeqSourceRows $ filter (hasAPoseidonID desiredPoseidonIDs) sRows
in SeqSourceRows $ filter (hasAPoseidonID desiredPoseidonIDs) sRows
where
hasAPoseidonID :: [String] -> SeqSourceRow -> Bool
hasAPoseidonID :: [PoseidonID] -> SeqSourceRow -> Bool
hasAPoseidonID jIDs seqSourceRow =
let sIDs = getMaybeListColumn $ sPoseidonID seqSourceRow
in any (`elem` jIDs) sIDs
Expand Down
4 changes: 2 additions & 2 deletions src/Poseidon/CLI/Serve.hs
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ extractPosJannoRow archiveName pacName pacVersion row = case (jLatitude row, jLo
let poseidonID = jPoseidonID row
loc = show <$> jLocation row
age = show <$> jDateBCADMedian row
in Just $ MapMarker lat lon poseidonID pacName pacVersion archiveName loc age
in Just $ MapMarker lat lon (show poseidonID) pacName pacVersion archiveName loc age
_ -> Nothing

prepPacVersions :: String -> [PoseidonPackage] -> ActionM [PoseidonPackage]
Expand All @@ -360,7 +360,7 @@ prepSamples pac = return $ getJannoRowsFromPac pac

prepSample :: String -> [JannoRow] -> ActionM JannoRow
prepSample sampleName rows = do
case filter (\r -> jPoseidonID r == sampleName) rows of
case filter (\r -> show (jPoseidonID r) == sampleName) rows of
[] -> fail $ "Sample " <> sampleName <> " does not exist"
[x] -> return x
_ -> fail $ "Sample " <> sampleName <> " exists multiple times"
Expand Down
2 changes: 1 addition & 1 deletion src/Poseidon/CLI/Summarise.hs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ summariseJannoRows (JannoRows rows) rawOutput = do
["Nr Samples"
, show (length rows)],
["Samples"
, paste . sort . map jPoseidonID $ rows],
, paste . sort . map (show . jPoseidonID) $ rows],
["Nr Primary Groups"
, uniqueNumber . map (head . getListColumn . jGroupName) $ rows],
["Primary Groups"
Expand Down
5 changes: 4 additions & 1 deletion src/Poseidon/CLI/Survey.hs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
module Poseidon.CLI.Survey where

import Poseidon.BibFile (BibTeX)
import Poseidon.ColumnTypesJanno (GeneticSex (..))
import Poseidon.ColumnTypesJanno (GeneticSex (..), PoseidonID (..))
import Poseidon.ColumnTypesUtils (CsvNamedRecord, ListColumn (..))
import Poseidon.GenotypeData (GenotypeDataSpec (..),
GenotypeFileSpec (..))
Expand Down Expand Up @@ -123,6 +123,9 @@ renderJannoCompleteness (JannoRows rows) =
-- A typeclass to determine if a field in a .janno row is filled
class PresenceCountable a where
countPresence :: a -> Int

instance PresenceCountable PoseidonID where
countPresence _ = 1
instance PresenceCountable (Maybe a) where
countPresence Nothing = 0
countPresence (Just _) = 1
Expand Down
58 changes: 55 additions & 3 deletions src/Poseidon/ColumnTypesJanno.hs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}

Expand All @@ -9,7 +10,10 @@ import Poseidon.ColumnTypesUtils

import Country (Country, alphaTwoUpper,
decodeAlphaTwo)
import qualified Data.ByteString.Char8 as B
import qualified Data.Char as C
import qualified Data.Csv as Csv
import Data.String (IsString (..))
import qualified Data.Text as T
import qualified Data.Text.Read as T
import GHC.Generics (Generic)
Expand Down Expand Up @@ -40,11 +44,59 @@ instance Ord GeneticSex where
compare (GeneticSex Unknown) (GeneticSex Female) = LT
compare _ _ = EQ
instance Csv.ToField GeneticSex where toField x = Csv.toField $ show x
instance Csv.FromField GeneticSex where parseField = parseTypeCSV "Genetic_Sex"
instance Csv.FromField GeneticSex where parseField :: Csv.Field -> Csv.Parser GeneticSex
parseField = parseTypeCSV "Genetic_Sex"

newtype PoseidonID = PoseidonID {unPoseidonID :: B.ByteString} deriving (Eq, Ord)

instance Makeable PoseidonID where
make txt =
if T.all isValidPoseidonIDChar txt
then return . PoseidonID . B.pack . T.unpack $ txt
else fail $ "PoseidonID contains invalid characters: " ++ T.unpack txt

isValidPoseidonIDChar :: Char -> Bool
isValidPoseidonIDChar c =
C.isAscii c && (C.isAlphaNum c || c `elem` ['_', '-', '.'])

instance Suspicious PoseidonID where inspect _ = Nothing

instance Show PoseidonID where
show (PoseidonID x) = B.unpack x

instance Csv.ToField PoseidonID where
toField (PoseidonID x) = Csv.toField x

instance Csv.FromField PoseidonID where
parseField = parseTypeCSV "PoseidonID"

-- the IsString instance allows us to write PoseidonID "MyID" directly, mainly for testing purposes.
instance IsString PoseidonID where
fromString str = PoseidonID (B.pack str)

-- | A datatype for the Group_Name .janno column
newtype GroupName = GroupName T.Text deriving (Eq, Ord)
$(makeInstances ''GroupName "Group_Name")
newtype GroupName = GroupName {unGroupName :: B.ByteString} deriving (Eq, Ord)

instance Makeable GroupName where
make txt =
if T.all isValidPoseidonIDChar txt
then return . GroupName . B.pack . T.unpack $ txt
else fail $ "GroupName contains invalid characters: " ++ T.unpack txt

instance Suspicious GroupName where inspect _ = Nothing

instance Show GroupName where
show (GroupName x) = B.unpack x

instance Csv.ToField GroupName where
toField (GroupName x) = Csv.toField x

instance Csv.FromField GroupName where
parseField = parseTypeCSV "Group_Name"

-- the IsString instance allows us to write GroupName "MyGroup" directly, mainly for testing purposes.
instance IsString GroupName where
fromString str = GroupName (B.pack str)

-- | A datatype for the Alternative_IDs .janno column
newtype JannoAlternativeID = JannoAlternativeID T.Text deriving (Eq)
Expand Down
11 changes: 6 additions & 5 deletions src/Poseidon/GenotypeData.hs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ module Poseidon.GenotypeData where

import Paths_poseidon_hs (version)
import Poseidon.ColumnTypesJanno (GroupName (..),
JannoGenotypePloidy (..))
JannoGenotypePloidy (..),
PoseidonID (..))
import Poseidon.ColumnTypesUtils (ListColumn (..))
import Poseidon.Janno (JannoRow (..))
import Poseidon.Utils (LogA, PoseidonException (..),
Expand Down Expand Up @@ -413,12 +414,12 @@ selectIndices indices (snpEntry, genoLine) = (snpEntry, V.fromList [genoLine V.!

writeVCF :: (MonadSafe m) => LogA -> [JannoRow] -> FilePath -> Consumer (EigenstratSnpEntry, GenoLine) m ()
writeVCF logA jannoRows vcfFile = do
let sampleNames = map (B.pack . jPoseidonID) jannoRows
groupNames = map ((\(GroupName n) -> T.unpack n) . head . getListColumn . jGroupName) jannoRows
let sampleNames = map (unPoseidonID . jPoseidonID) jannoRows
groupNames = map (B.unpack . unGroupName . head . getListColumn . jGroupName) jannoRows
sex = map jGeneticSex jannoRows
forM_ jannoRows $ \jannoRow -> do
when (jGenotypePloidy jannoRow == Nothing) . logWithEnv logA . logWarning $
"Missing GenotypePloidy for individual ++ " ++ jPoseidonID jannoRow ++
"Missing GenotypePloidy for individual ++ " ++ show (jPoseidonID jannoRow) ++
". For VCF output I will assume diploid genotypes. " ++
"Please set the GenotypePloidy column explitly in the Janno File to haploid or diploid."
let metaInfoLines = map B.pack [
Expand Down Expand Up @@ -452,7 +453,7 @@ createVCFentry logA jannoRows (EigenstratSnpEntry chrom pos _ id_ ref alt, genoL
(Missing, Just Haploid) -> return ["."]
(HomRef , Just Haploid) -> return ["0"]
(Het , Just Haploid) -> do
logWithEnv logA . logWarning $ "Encountered a heterozygous genotype for " ++ s ++
logWithEnv logA . logWarning $ "Encountered a heterozygous genotype for " ++ show (unPoseidonID s) ++
" at position " ++ show chrom ++ ":" ++ show pos ++ ", but the individual's GenotypePloidy is given as " ++
" Haploid in the Janno-File. I have to encode this in the VCF as a diploid genotype. Consider changing this " ++
"individual's GenotypePloidy to diploid!"
Expand Down
15 changes: 7 additions & 8 deletions src/Poseidon/Janno.hs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ import Data.List (elemIndex, foldl',
intercalate, nub, sort,
transpose, (\\))
import Data.Maybe (catMaybes, fromJust)
import qualified Data.Text as T
import qualified Data.Vector as V
import Generics.SOP.TH (deriveGeneric)
import GHC.Generics (Generic)
Expand Down Expand Up @@ -77,7 +76,7 @@ instance Monoid JannoRows where
-- See https://github.com/poseidon-framework/poseidon2-schema/blob/master/janno_columns.tsv
-- for more details
data JannoRow = JannoRow
{ jPoseidonID :: String
{ jPoseidonID :: PoseidonID
, jGeneticSex :: GeneticSex
, jGroupName :: ListColumn GroupName
, jAlternativeIDs :: Maybe (ListColumn JannoAlternativeID)
Expand Down Expand Up @@ -288,9 +287,9 @@ createMinimalJanno xs = JannoRows $ map createMinimalSample xs
createMinimalSample :: EigenstratIndEntry -> JannoRow
createMinimalSample (EigenstratIndEntry id_ sex pop) =
JannoRow {
jPoseidonID = Bchs.unpack id_ -- TODO: this will have to change. We need to make PoseidonID itself ByteString
jPoseidonID = PoseidonID id_
, jGeneticSex = GeneticSex sex
, jGroupName = ListColumn [GroupName . T.pack . Bchs.unpack $ pop] -- same thing, see above.
, jGroupName = ListColumn [GroupName pop]
, jAlternativeIDs = Nothing
, jRelationTo = Nothing
, jRelationDegree = Nothing
Expand Down Expand Up @@ -450,8 +449,7 @@ readJannoFileRow mandatoryCols jannoPath (lineNumber, row) = do
renderWarning e = "Cross-column anomaly in " ++ jannoPath ++ " " ++
"in line " ++ renderLocation ++ ": " ++ e
renderLocation :: String
renderLocation = show lineNumber ++
" (Poseidon_ID: " ++ jPoseidonID jannoRow ++ ")"
renderLocation = show lineNumber ++ " (Poseidon_ID: " ++ show (jPoseidonID jannoRow) ++ ")"

-- Global janno consistency checks

Expand Down Expand Up @@ -479,7 +477,7 @@ checkJannoRowConsistency x =

checkMandatoryStringNotEmpty :: JannoRow -> JannoRowLog JannoRow
checkMandatoryStringNotEmpty x =
let notEmpty = (not . null . jPoseidonID $ x) &&
let notEmpty = (not . Bchs.null . unPoseidonID . jPoseidonID $ x) &&
(not . null . getListColumn . jGroupName $ x) &&
(not . null . show . head . getListColumn . jGroupName $ x)
in case notEmpty of
Expand Down Expand Up @@ -543,4 +541,5 @@ jannoRows2EigenstratIndEntries :: JannoRows -> [EigenstratIndEntry]
jannoRows2EigenstratIndEntries (JannoRows jannoRows) = do -- list monad
jannoRow <- jannoRows -- looping over jannoRows
let GroupName gText = head . getListColumn . jGroupName $ jannoRow
return $ EigenstratIndEntry (Bchs.pack $ jPoseidonID jannoRow) (sfSex (jGeneticSex jannoRow)) (Bchs.pack $ T.unpack gText)
PoseidonID id_ = jPoseidonID jannoRow
return $ EigenstratIndEntry id_ (sfSex (jGeneticSex jannoRow)) gText
32 changes: 16 additions & 16 deletions src/Poseidon/Package.hs
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ module Poseidon.Package (

import Poseidon.BibFile (BibEntry (..), BibTeX,
readBibTeXFile)
import Poseidon.ColumnTypesJanno (GeneticSex (..),
import Poseidon.ColumnTypesJanno (GeneticSex (..), GroupName (..),
JannoLibraryBuilt (..),
JannoPublication (..),
JannoUDG (..))
JannoUDG (..), PoseidonID (..))
import Poseidon.ColumnTypesSSF (SSFLibraryBuilt (..), SSFUDG (..))
import Poseidon.ColumnTypesUtils (ListColumn (..),
getMaybeListColumn)
Expand Down Expand Up @@ -542,28 +542,28 @@ checkFiles baseDir ignoreChecksums ignoreGenotypeFilesMissing yml = do
-- the last flag is important for reading VCFs, which can lack group and sex information.
checkJannoIndConsistency :: String -> JannoRows -> [EigenstratIndEntry] -> Bool -> IO ()
checkJannoIndConsistency pacName (JannoRows rows) indEntries isVCF = do
let genoIDs = [ BSC.unpack x | EigenstratIndEntry x _ _ <- indEntries]
let genoIDs = [ x | EigenstratIndEntry x _ _ <- indEntries]
genoSexs = [ x | EigenstratIndEntry _ x _ <- indEntries]
genoGroups = [ BSC.unpack x | EigenstratIndEntry _ _ x <- indEntries]
let jannoIDs = map jPoseidonID rows
genoGroups = [ x | EigenstratIndEntry _ _ x <- indEntries]
let jannoIDs = map (unPoseidonID . jPoseidonID) rows
jannoSexs = map (sfSex . jGeneticSex) rows
jannoGroups = map (show . head . getListColumn . jGroupName) rows
jannoGroups = map (unGroupName . head . getListColumn . jGroupName) rows
let idMis = genoIDs /= jannoIDs
sexMis = genoSexs /= jannoSexs
groupMis = genoGroups /= jannoGroups
let sexAllUnknown = all (==Unknown) genoSexs
groupsAllUnknown = all (=="unknown") genoGroups
when idMis $ throwM $ PoseidonCrossFileConsistencyException pacName $
"Individual ID mismatch between genotype data (left) and .janno files (right): " ++
renderMismatch genoIDs jannoIDs
renderMismatch (map show genoIDs) (map show jannoIDs)
when (sexMis && not (isVCF && sexAllUnknown)) $ throwM $ PoseidonCrossFileConsistencyException pacName $
"Individual Sex mismatch between genotype data (left) and .janno files (right): " ++
renderMismatch (map show genoSexs) (map show jannoSexs)
when (groupMis && not (isVCF && groupsAllUnknown)) $ throwM $ PoseidonCrossFileConsistencyException pacName $
"Individual GroupID mismatch between genotype data (left) and .janno files (right). Note \
\that this could be due to a wrong Plink file population-name encoding \
\(see the --inPlinkPopName option). " ++
renderMismatch genoGroups jannoGroups
renderMismatch (map show genoGroups) (map show jannoGroups)

renderMismatch :: [String] -> [String] -> String
renderMismatch a b =
Expand All @@ -583,20 +583,20 @@ checkSeqSourceJannoConsistency pacName (SeqSourceRows sRows) (JannoRows jRows) =
checkPoseidonIDOverlap
checkUDGandLibraryBuiltOverlap
where
js = map (\r -> (jPoseidonID r, jUDG r, jLibraryBuilt r)) jRows
ss = map (\r -> (getMaybeListColumn $ sPoseidonID r, sUDG r, sLibraryBuilt r)) sRows
js = [(jPoseidonID r, jUDG r, jLibraryBuilt r) | r <- jRows]
ss = [(getMaybeListColumn $ sPoseidonID r, sUDG r, sLibraryBuilt r) | r <- sRows]
checkPoseidonIDOverlap :: PoseidonIO ()
checkPoseidonIDOverlap = do
let flatSeqSourceIDs = nub $ concat $ [a | (a,_,_) <- ss]
misMatch = flatSeqSourceIDs \\ [a | (a,_,_) <- js]
unless (null misMatch) $ do
logWarning $ "The .ssf file in the package " ++ pacName ++
" features Poseidon_IDs that are not in the package: " ++ intercalate ", " misMatch
" features Poseidon_IDs that are not in the package: " ++ (intercalate ", " . map show $ misMatch)
checkUDGandLibraryBuiltOverlap :: PoseidonIO ()
checkUDGandLibraryBuiltOverlap = do
mapM_ checkOneIndividual js
where
checkOneIndividual :: (String, Maybe JannoUDG, Maybe JannoLibraryBuilt) -> PoseidonIO ()
checkOneIndividual :: (PoseidonID, Maybe JannoUDG, Maybe JannoLibraryBuilt) -> PoseidonIO ()
checkOneIndividual (jannoPoseidonID, jannoUDG, jannoLibraryBuilt) = do
let relevantSeqSourceRows = filter (\(seqSourcePoseidonID,_,_) -> jannoPoseidonID `elem` seqSourcePoseidonID) ss
allSeqSourceUDGs = catMaybes $ [b | (_,b,_) <- relevantSeqSourceRows]
Expand All @@ -606,13 +606,13 @@ checkSeqSourceJannoConsistency pacName (SeqSourceRows sRows) (JannoRows jRows) =
Just j -> unless (all (compareU j) allSeqSourceUDGs) $
throwM $ PoseidonCrossFileConsistencyException pacName $
"The information on UDG treatment in .janno and .ssf do not match" ++
" for the individual: " ++ jannoPoseidonID ++ " (" ++ show j ++ " <> " ++ show allSeqSourceUDGs ++ ")"
" for the individual: " ++ show jannoPoseidonID ++ " (" ++ show j ++ " <> " ++ show allSeqSourceUDGs ++ ")"
case jannoLibraryBuilt of
Nothing -> return ()
Just j -> unless (all (compareL j) allSeqSourceLibraryBuilts) $
throwM $ PoseidonCrossFileConsistencyException pacName $
"The information on library strandedness in .janno and .ssf do not match" ++
" for the individual: " ++ jannoPoseidonID ++ " (" ++ show j ++ " <> " ++ show allSeqSourceLibraryBuilts ++ ")"
" for the individual: " ++ show jannoPoseidonID ++ " (" ++ show j ++ " <> " ++ show allSeqSourceLibraryBuilts ++ ")"
compareU :: JannoUDG -> SSFUDG -> Bool
compareU Mixed _ = True
compareU Minus SSFMinus = True
Expand Down Expand Up @@ -872,7 +872,7 @@ getJointIndividualInfo packages = do
isLatest <- isLatestInCollection packages pac
forM (getJannoRowsFromPac pac) $ \jannoRow -> do
let indInfo = IndividualInfo
(jPoseidonID jannoRow)
(show . jPoseidonID $ jannoRow)
((map show . getListColumn . jGroupName) jannoRow)
(makePacNameAndVersion pac)
return (indInfo, isLatest)
Expand All @@ -883,7 +883,7 @@ getExtendedIndividualInfo :: (MonadThrow m) => [PoseidonPackage] -> AddColSpec -
getExtendedIndividualInfo allPackages addJannoColSpec = sequence $ do -- list monad
pac <- allPackages -- outer loop (automatically concatenating over inner loops)
jannoRow <- getJannoRowsFromPac pac -- inner loop
let name = jPoseidonID jannoRow
let name = show $ jPoseidonID jannoRow
groups = map show $ getListColumn . jGroupName $ jannoRow
colNames = case addJannoColSpec of
AddColAll -> jannoHeaderString \\ ["Poseidon_ID", "Group_Name"] -- Nothing means all Janno columns
Expand Down
Loading