Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions src/Poseidon/CLI/Forge.hs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ import Control.Exception (catch, throwIO)
import Control.Monad (filterM, forM, forM_, unless,
when)
import Data.List (intercalate, nub)
import Data.Maybe (mapMaybe)
import Data.Maybe (catMaybes, mapMaybe)
import Data.Time (getCurrentTime)
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
Expand Down Expand Up @@ -186,6 +186,7 @@ runForge (
maybeSnpFile of
Nothing -> snpSetMergeList snpSetList intersect_
Just _ -> SNPSetOther
(newRefName, newRefUrl) <- fillMissingReferenceAssemblyInfo relevantPackages
-- compile genotype data structure
let gz = if outZip then "gz" else ""
genotypeFileData <- case outFormat of
Expand All @@ -198,7 +199,7 @@ runForge (
(outName <.> "bim" <.> gz) Nothing
(outName <.> "fam") Nothing
GenotypeOutFormatVCF -> return $ GenotypeVCF (outName <.> "vcf" <.> gz) Nothing
let genotypeData = GenotypeDataSpec genotypeFileData (Just newSNPSet)
let genotypeData = GenotypeDataSpec genotypeFileData (Just newSNPSet) newRefName newRefUrl

-- assemble and write result depending on outMode --
logInfo "Creating new package entity"
Expand Down Expand Up @@ -351,3 +352,23 @@ fillMissingSnpSets packages = forM packages $ \pac -> do
logWarning $ "Warning for package " ++ show pac_ ++ ": field \"snpSet\" \
\is not set. I will interpret this as \"snpSet: Other\""
return SNPSetOther

-- | Determine the reference genome assembly name and URL for a forged package.
--
-- Collects the optional 'genotypeRefAssemblyName' and 'genotypeRefAssemblyURL'
-- fields from the genotype data of all given packages. Packages that do not
-- set a field are ignored for that field. If the remaining values disagree, a
-- warning is logged and the first value (in package order) is used. If no
-- package sets a field, 'Nothing' is returned for it.
--
-- Returns a tuple @(assembly name, assembly URL)@.
fillMissingReferenceAssemblyInfo :: [PoseidonPackage] -> PoseidonIO (Maybe String, Maybe String)
fillMissingReferenceAssemblyInfo packages = do
    let genoSpecs      = map posPacGenotypeData packages
        -- 'nub' keeps the first occurrence, so package order decides the winner
        uniqueRefNames = nub . catMaybes $ map genotypeRefAssemblyName genoSpecs
        -- URLs must be read from the URL field, not from the name field
        uniqueRefUrls  = nub . catMaybes $ map genotypeRefAssemblyURL genoSpecs
    when (moreThanOne uniqueRefNames) $
        logWarning $ "different reference genome assembly names given: " ++ show uniqueRefNames ++
            ". I will pick the first for the forge output file"
    when (moreThanOne uniqueRefUrls) $
        logWarning $ "different reference genome assembly URLs given: " ++ show uniqueRefUrls ++
            ". I will pick the first for the forge output file"
    return (firstOrNothing uniqueRefNames, firstOrNothing uniqueRefUrls)
  where
    -- True if the list has at least two elements; avoids traversing the whole list
    moreThanOne :: [a] -> Bool
    moreThanOne (_:_:_) = True
    moreThanOne _       = False
    -- total replacement for 'head'
    firstOrNothing :: [a] -> Maybe a
    firstOrNothing []    = Nothing
    firstOrNothing (x:_) = Just x
2 changes: 1 addition & 1 deletion src/Poseidon/CLI/Genoconvert.hs
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode outZip pac =
GenotypeOutFormatPlink -> return $
GenotypePlink (outFilesRel !! 0) Nothing (outFilesRel !! 1) Nothing (outFilesRel !! 2) Nothing
GenotypeOutFormatVCF -> return $ GenotypeVCF (outFilesRel !! 0) Nothing
let newGenotypeData = GenotypeDataSpec gFileSpec (genotypeSnpSet . posPacGenotypeData $ pac)
let newGenotypeData = GenotypeDataSpec gFileSpec (genotypeSnpSet . posPacGenotypeData $ pac) Nothing Nothing
newPac = pac { posPacGenotypeData = newGenotypeData }
logInfo $ "Adjusting POSEIDON.yml for " ++ show (posPacNameAndVersion pac)
liftIO $ writePoseidonPackage newPac
Expand Down
4 changes: 2 additions & 2 deletions src/Poseidon/CLI/OptparseApplicativeParsers.hs
Original file line number Diff line number Diff line change
Expand Up @@ -463,10 +463,10 @@ parseBasePath = OP.strOption (
OP.help "A base directory to search for Poseidon packages.")

parseInGenoWithoutSNPSet :: OP.Parser GenotypeDataSpec
parseInGenoWithoutSNPSet = GenotypeDataSpec <$> (parseInGenoOne <|> parseInGenoSep) <*> pure Nothing
parseInGenoWithoutSNPSet = GenotypeDataSpec <$> (parseInGenoOne <|> parseInGenoSep) <*> pure Nothing <*> pure Nothing <*> pure Nothing

parseInGenotypeDataset :: OP.Parser GenotypeDataSpec
parseInGenotypeDataset = GenotypeDataSpec <$> (parseInGenoOne <|> parseInGenoSep) <*> (Just <$> parseGenotypeSNPSet)
parseInGenotypeDataset = GenotypeDataSpec <$> (parseInGenoOne <|> parseInGenoSep) <*> (Just <$> parseGenotypeSNPSet) <*> pure Nothing <*> pure Nothing

parseInGenoOne :: OP.Parser GenotypeFileSpec
parseInGenoOne = OP.option (OP.eitherReader readGenoInput) (
Expand Down
31 changes: 21 additions & 10 deletions src/Poseidon/GenotypeData.hs
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,10 @@ data GenoDataSource = PacBaseDir
deriving Show

data GenotypeDataSpec = GenotypeDataSpec {
genotypeFileSpec :: GenotypeFileSpec,
genotypeSnpSet :: Maybe SNPSetSpec
genotypeFileSpec :: GenotypeFileSpec,
genotypeSnpSet :: Maybe SNPSetSpec,
genotypeRefAssemblyName :: Maybe String,
genotypeRefAssemblyURL :: Maybe String
} deriving (Show, Eq)

data GenotypeFileSpec = GenotypeEigenstrat {
Expand Down Expand Up @@ -122,11 +124,13 @@ instance FromJSON GenotypeDataSpec where
<*> v .:? "genoFileChkSum"
_ -> fail ("unknown format " ++ T.unpack gformat)
snpSet <- v .:? "snpSet"
return $ GenotypeDataSpec gfileSpec snpSet
refName <- v .:? "referenceGenomeAssembly"
refURL <- v .:? "referenceGenomeAssemblyURL"
return $ GenotypeDataSpec gfileSpec snpSet refName refURL

instance ToJSON GenotypeDataSpec where
-- this encodes directly to a bytestring Builder
toJSON (GenotypeDataSpec gfileSpec snpSet) = case gfileSpec of
toJSON (GenotypeDataSpec gfileSpec snpSet refName refURL) = case gfileSpec of
GenotypeEigenstrat genoF genoFchk snpF snpFchk indF indFchk ->
object [
"format" .= ("EIGENSTRAT" :: String),
Expand All @@ -136,7 +140,9 @@ instance ToJSON GenotypeDataSpec where
"snpFileChkSum" .= snpFchk,
"indFile" .= indF,
"indFileChkSum" .= indFchk,
"snpSet" .= snpSet
"snpSet" .= snpSet,
"referenceGenomeAssembly" .= refName,
"referenceGenomeAssemblyURL" .= refURL
]
GenotypePlink genoF genoFchk snpF snpFchk indF indFchk ->
object [
Expand All @@ -147,13 +153,18 @@ instance ToJSON GenotypeDataSpec where
"snpFileChkSum" .= snpFchk,
"indFile" .= indF,
"indFileChkSum" .= indFchk,
"snpSet" .= snpSet
"snpSet" .= snpSet,
"referenceGenomeAssembly" .= refName,
"referenceGenomeAssemblyURL" .= refURL
]
GenotypeVCF genoF genoFchk ->
object [
"format" .= ("VCF" :: String),
"genoFile" .= genoF,
"genoFileChkSum".= genoFchk
"genoFileChkSum".= genoFchk,
"snpSet" .= snpSet,
"referenceGenomeAssembly" .= refName,
"referenceGenomeAssemblyURL" .= refURL
]

data SNPSetSpec = SNPSet1240K
Expand Down Expand Up @@ -196,7 +207,7 @@ snpSetMerge SNPSetHumanOrigins SNPSet1240K False = SNPSet1240K
-- | removes directories of all filenames and returns a tuple of the basename and a modified GenotypeDataSpec with pure filenames
-- In case basedirectories do not match, this function will throw an exception
reduceGenotypeFilepaths :: (MonadThrow m) => GenotypeDataSpec -> m (FilePath, GenotypeDataSpec)
reduceGenotypeFilepaths gd@(GenotypeDataSpec gFileSpec _) = do
reduceGenotypeFilepaths gd@(GenotypeDataSpec gFileSpec _ _ _) = do
(baseDir, newGfileSpec) <- case gFileSpec of
GenotypeEigenstrat genoF _ snpF _ indF _ -> do
let baseDirs = map takeDirectory [genoF, snpF, indF]
Expand All @@ -218,7 +229,7 @@ reduceGenotypeFilepaths gd@(GenotypeDataSpec gFileSpec _) = do
loadIndividuals :: FilePath -- ^ the base directory
-> GenotypeDataSpec -- ^ the Genotype spec
-> PoseidonIO [EigenstratIndEntry] -- ^ the returned list of EigenstratIndEntries.
loadIndividuals d (GenotypeDataSpec gFileSpec _) = do
loadIndividuals d (GenotypeDataSpec gFileSpec _ _ _) = do
popMode <- envInputPlinkMode
case gFileSpec of
GenotypeEigenstrat _ _ _ _ fn _ -> readEigenstratInd (d </> fn)
Expand Down Expand Up @@ -260,7 +271,7 @@ loadGenotypeData :: (MonadSafe m) =>
-> GenotypeDataSpec -- ^ the genotype spec
-> m (Producer (EigenstratSnpEntry, GenoLine) m ())
-- ^ a Producer over the Snp position values and the genotype line.
loadGenotypeData baseDir (GenotypeDataSpec gFileSpec _) =
loadGenotypeData baseDir (GenotypeDataSpec gFileSpec _ _ _) =
case gFileSpec of
GenotypeEigenstrat genoF _ snpF _ indF _ -> snd <$> readEigenstrat (baseDir </> genoF) (baseDir </> snpF) (baseDir </> indF)
GenotypePlink genoF _ snpF _ indF _ -> snd <$> readPlink (baseDir </> genoF) (baseDir </> snpF) (baseDir </> indF)
Expand Down
35 changes: 31 additions & 4 deletions src/Poseidon/Package.hs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ module Poseidon.Package (
PoseidonPackage(..),
PoseidonException(..),
PackageReadOptions (..),
LicenseSpec (..),
findAllPoseidonYmlFiles,
checkJannoIndConsistency,
readPoseidonPackageCollection,
Expand Down Expand Up @@ -130,6 +131,7 @@ data PoseidonYamlStruct = PoseidonYamlStruct
, _posYamlContributor :: [ContributorSpec]
, _posYamlPackageVersion :: Maybe Version
, _posYamlLastModified :: Maybe Day
, _posYamlLicense :: Maybe LicenseSpec
, _posYamlGenotypeData :: GenotypeDataSpec
, _posYamlJannoFile :: Maybe FilePath
, _posYamlJannoFileChkSum :: Maybe String
Expand All @@ -142,6 +144,13 @@ data PoseidonYamlStruct = PoseidonYamlStruct
}
deriving (Show, Eq, Generic)

-- | License information of a Poseidon package, as stored under the optional
-- @license@ key of the POSEIDON.yml file.
data LicenseSpec = LicenseSpec
    { licenseName :: String
    -- ^ the license name (required @name@ field)
    , licenseURL  :: Maybe String
    -- ^ the optional @url@ field
    , licenseFile :: Maybe FilePath
    -- ^ the optional @file@ field
    }
    deriving (Show, Eq, Generic)

poseidonJannoFilePath :: FilePath -> PoseidonYamlStruct -> Maybe FilePath
poseidonJannoFilePath baseDir yml = (baseDir </>) <$> _posYamlJannoFile yml
poseidonSeqSourceFilePath :: FilePath -> PoseidonYamlStruct -> Maybe FilePath
Expand All @@ -161,6 +170,7 @@ instance FromJSON PoseidonYamlStruct where
<*> v .:? "contributor" .!= []
<*> v .:? "packageVersion"
<*> v .:? "lastModified"
<*> v .:? "license"
<*> v .: "genotypeData"
<*> v .:? "jannoFile"
<*> v .:? "jannoFileChkSum"
Expand All @@ -179,6 +189,7 @@ instance ToJSON PoseidonYamlStruct where
(if not $ null (_posYamlContributor x) then ["contributor" .= _posYamlContributor x] else []) ++
["packageVersion" .= _posYamlPackageVersion x,
"lastModified" .= _posYamlLastModified x,
"license" .= _posYamlLicense x,
"genotypeData" .= _posYamlGenotypeData x,
"jannoFile" .= _posYamlJannoFile x,
"jannoFileChkSum" .= _posYamlJannoFileChkSum x,
Expand All @@ -190,6 +201,19 @@ instance ToJSON PoseidonYamlStruct where
"changelogFile" .= _posYamlChangelogFile x
]

-- | Parses a 'LicenseSpec' from an object with a required @name@ key and
-- optional @url@ and @file@ keys.
instance FromJSON LicenseSpec where
    parseJSON = withObject "LicenseSpec" $ \obj -> do
        name <- obj .:  "name"
        url  <- obj .:? "url"
        file <- obj .:? "file"
        return $ LicenseSpec name url file

-- | Encodes a 'LicenseSpec' as an object with the keys @name@, @url@ and
-- @file@, mirroring the keys read by the 'FromJSON' instance.
instance ToJSON LicenseSpec where
    toJSON lic = object licenseFields
      where
        licenseFields =
            [ "name" .= licenseName lic
            , "url"  .= licenseURL lic
            , "file" .= licenseFile lic
            ]

instance HasNameAndVersion PoseidonYamlStruct where
getPacName = _posYamlTitle
getPacVersion = _posYamlPackageVersion
Expand All @@ -208,6 +232,7 @@ data PoseidonPackage = PoseidonPackage
-- ^ the contributor(s) of the package
, posPacLastModified :: Maybe Day
-- ^ the optional date of last update
, posPacLicense :: Maybe LicenseSpec
, posPacGenotypeData :: GenotypeDataSpec
-- ^ the paths to the genotype files
, posPacJannoFile :: Maybe FilePath
Expand Down Expand Up @@ -382,7 +407,7 @@ readPoseidonPackage opts ymlPath = do
bs <- liftIO $ B.readFile ymlPath

-- read yml files
yml@(PoseidonYamlStruct ver tit des con pacVer mod_ geno jannoF jannoC seqSourceF seqSourceC bibF bibC readF changeF) <- case decodeEither' bs of
yml@(PoseidonYamlStruct ver tit des con pacVer mod_ lic geno jannoF jannoC seqSourceF seqSourceC bibF bibC readF changeF) <- case decodeEither' bs of
Left err -> throwM $ PoseidonYamlParseException ymlPath err
Right pac -> return pac
checkYML yml
Expand Down Expand Up @@ -431,7 +456,7 @@ readPoseidonPackage opts ymlPath = do
logInfo $ "Trying to parse genotype data for package: " ++ tit

-- create PoseidonPackage
let pac = PoseidonPackage baseDir ver (PacNameAndVersion tit pacVer) des con mod_ geno jannoF janno jannoC seqSourceF seqSource seqSourceC bibF bib bibC readF changeF
let pac = PoseidonPackage baseDir ver (PacNameAndVersion tit pacVer) des con mod_ lic geno jannoF janno jannoC seqSourceF seqSource seqSourceC bibF bib bibC readF changeF

-- validate genotype data
when (not (_readOptIgnoreGeno opts) && _readOptGenoCheck opts) $
Expand Down Expand Up @@ -713,6 +738,7 @@ newMinimalPackageTemplate baseDir name gd = do
, posPacDescription = Nothing
, posPacContributor = []
, posPacLastModified = Nothing
, posPacLicense = Nothing
, posPacGenotypeData = reducedGD
, posPacJannoFile = Nothing
, posPacJanno = mempty
Expand Down Expand Up @@ -790,8 +816,8 @@ newPackageTemplate baseDir name genoData indsOrJanno seqSource bib = do
}

writePoseidonPackage :: PoseidonPackage -> IO ()
writePoseidonPackage (PoseidonPackage baseDir ver nameAndVer des con mod_ geno jannoF _ jannoC seqSourceF _ seqSourceC bibF _ bibFC readF changeF) = do
let yamlPac = PoseidonYamlStruct ver (getPacName nameAndVer) des con (getPacVersion nameAndVer) mod_ geno jannoF jannoC seqSourceF seqSourceC bibF bibFC readF changeF
writePoseidonPackage (PoseidonPackage baseDir ver nameAndVer des con mod_ lic geno jannoF _ jannoC seqSourceF _ seqSourceC bibF _ bibFC readF changeF) = do
let yamlPac = PoseidonYamlStruct ver (getPacName nameAndVer) des con (getPacVersion nameAndVer) mod_ lic geno jannoF jannoC seqSourceF seqSourceC bibF bibFC readF changeF
outF = baseDir </> "POSEIDON.yml"
B.writeFile outF $!! encodePretty opts yamlPac
where
Expand All @@ -807,6 +833,7 @@ writePoseidonPackage (PoseidonPackage baseDir ver nameAndVer des con mod_ geno j
"orcid",
"packageVersion",
"lastModified",
"license",
"genotypeData",
"format",
"genoFile",
Expand Down
6 changes: 3 additions & 3 deletions test/Poseidon/GenotypeDataSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -61,16 +61,16 @@ testJoinGenoEntries =
testLoadVCF :: Spec
testLoadVCF = describe "loadIndividuals(VCF)" $ do
it "should correctly read group names and genetic sex from VCF header" $ do
let gSpec = GenotypeDataSpec (GenotypeVCF "geno.vcf" Nothing) Nothing
let gSpec = GenotypeDataSpec (GenotypeVCF "geno.vcf" Nothing) Nothing Nothing Nothing
let baseDir = "test/testDat/testPackages/other_test_packages/Schiffels_2016_vcf/"
fmap (take 3) (testLog $ loadIndividuals baseDir gSpec) `shouldReturn`
[EigenstratIndEntry "XXX001" Male "POP1", EigenstratIndEntry "XXX002" Female "POP2", EigenstratIndEntry "XXX003" Male "POP1"]
it "should throw if encountering wrong number of group names" $ do
let gSpec = GenotypeDataSpec (GenotypeVCF "geno_wrong_groupnames.vcf" Nothing) Nothing
let gSpec = GenotypeDataSpec (GenotypeVCF "geno_wrong_groupnames.vcf" Nothing) Nothing Nothing Nothing
let baseDir = "test/testDat/testGenoFiles"
testLog (loadIndividuals baseDir gSpec) `shouldThrow` groupNameExc
it "should throw if encountering wrong number of genetic sex entries" $ do
let gSpec = GenotypeDataSpec (GenotypeVCF "geno_wrong_sexEntries.vcf" Nothing) Nothing
let gSpec = GenotypeDataSpec (GenotypeVCF "geno_wrong_sexEntries.vcf" Nothing) Nothing Nothing Nothing
let baseDir = "test/testDat/testGenoFiles"
testLog (loadIndividuals baseDir gSpec) `shouldThrow` sexEntryExc
where
Expand Down
16 changes: 13 additions & 3 deletions test/Poseidon/PackageSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ import Poseidon.GenotypeData (GenotypeDataSpec (..),
GenotypeFileSpec (..),
SNPSetSpec (..))
import Poseidon.Janno (createMinimalJanno)
import Poseidon.Package (PackageReadOptions (..),
import Poseidon.Package (LicenseSpec (..),
PackageReadOptions (..),
PoseidonPackage (..),
PoseidonYamlStruct (..),
checkJannoIndConsistency,
Expand Down Expand Up @@ -71,6 +72,9 @@ contributor:
email: schiffels@institute.org
orcid: 0000-0002-1017-9150
packageVersion: 1.0.0
license:
name: CC-BY-4.0
url: https://creativecommons.org/licenses/by/4.0/
lastModified: 2020-02-28
bibFile: sources.bib
genotypeData:
Expand Down Expand Up @@ -100,6 +104,7 @@ truePackageRelPaths = PoseidonYamlStruct {
],
_posYamlPackageVersion = Just $ makeVersion [1, 0, 0],
_posYamlLastModified = Just $ fromGregorian 2020 2 28,
_posYamlLicense = Just $ LicenseSpec "CC-BY-4.0" (Just "https://creativecommons.org/licenses/by/4.0/") Nothing,
_posYamlGenotypeData = GenotypeDataSpec {
genotypeFileSpec = GenotypePlink {
_plGenoFile = "Schiffels_2016.bed",
Expand All @@ -109,7 +114,9 @@ truePackageRelPaths = PoseidonYamlStruct {
_plIndFile = "Schiffels_2016.fam",
_plIndFileChkSum = Nothing
},
genotypeSnpSet = Just SNPSet1240K
genotypeSnpSet = Just SNPSet1240K,
genotypeRefAssemblyName = Nothing,
genotypeRefAssemblyURL = Nothing
},
_posYamlJannoFile = Just "Schiffels_2016.janno",
_posYamlJannoFileChkSum = Nothing,
Expand Down Expand Up @@ -181,6 +188,7 @@ testPoseidonFromYAML = describe "PoseidonPackage.fromYAML" $ do
_posYamlContributor = [],
_posYamlPackageVersion = Nothing,
_posYamlLastModified = Nothing,
_posYamlLicense = Nothing,
_posYamlGenotypeData = GenotypeDataSpec {
genotypeFileSpec = GenotypePlink {
_plGenoFile = "test.bed",
Expand All @@ -190,7 +198,9 @@ testPoseidonFromYAML = describe "PoseidonPackage.fromYAML" $ do
_plIndFile = "test.fam",
_plIndFileChkSum = Nothing
},
genotypeSnpSet = Nothing
genotypeSnpSet = Nothing,
genotypeRefAssemblyName = Nothing,
genotypeRefAssemblyURL = Nothing
},
_posYamlJannoFile = Nothing,
_posYamlJannoFileChkSum = Nothing,
Expand Down
4 changes: 2 additions & 2 deletions test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ fd632717ecaf337a39cfd7a828a54e99 init init/Schiffels/Schiffels.janno
9edc4a757f785a8ecb59c54d16c5690a init init/Schiffels/Schiffels.bib
c35421d9be15aa66fa3a3c46df1f746c init init/Wang/POSEIDON.yml
ae66d851301f4a761b819f97ec28fa55 init init/Wang/Wang_2020.bed
956c7bf4c6999cc322ad8407d8bef776 init init_vcf/Schiffels_vcf/POSEIDON.yml
72400156a00aa01e4da7c84a1fcfe829 init init_vcf/Schiffels_vcf/POSEIDON.yml
fd632717ecaf337a39cfd7a828a54e99 init init_vcf/Schiffels_vcf/Schiffels.janno
b088fa0fea0d013ddebacd7b6276fc53 init init_vcf/Schiffels_vcf/geno.vcf
9edc4a757f785a8ecb59c54d16c5690a init init_vcf/Schiffels_vcf/Schiffels.bib
Expand Down Expand Up @@ -130,7 +130,7 @@ ad7e56177aad0a720f0bde13d47f2ac1 forge forge/ForgePac19/CHANGELOG.md
b7b649620cd37bd4a6d6f0f31c1c56da forge forge/ForgePac19/ForgePac19.janno
b36b3ca509c235d0f15571c96195e801 forge forge/ForgePac20/POSEIDON.yml
e375863bca9e4a91c9855396abde31c7 forge forge/ForgePac20/ForgePac20.janno
1f24e4ad0943c830a58e9ae168f9ffa6 forge forge/ForgePac21/POSEIDON.yml
d17a13be042bc19941d3b45fe9e699eb forge forge/ForgePac21/POSEIDON.yml
abdb2335dc85cbd21af5c41db3d8394e forge forge/ForgePac21/ForgePac21.vcf
8846333d9a1de6510f25a3816cc70fef forge forge/ForgePac21/ForgePac21.janno
9089f5d5602937bb7713e1dc8d7a8f2d forge forge/ForgePac21/ForgePac21.ssf
Expand Down
Loading