@@ -92,15 +92,16 @@ object TimeSeriesRDD {
 
   private[flint] def convertDfTimestamps(
     dataFrame: DataFrame,
-    timeUnit: TimeUnit
+    timeUnit: TimeUnit,
+    timeColumn: String = timeColumnName
   ): DataFrame = {
     if (timeUnit == NANOSECONDS) {
       dataFrame
     } else {
       val converter: Long => Long = TimeUnit.NANOSECONDS.convert(_, timeUnit)
       val udfConverter = udf(converter)
 
-      dataFrame.withColumn(timeColumnName, udfConverter(col(timeColumnName)))
+      dataFrame.withColumn(timeColumnName, udfConverter(col(timeColumn)))
     }
   }
 
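For orientation, a minimal standalone sketch of what the changed helper now does: it reads epoch values from `timeColumn` but still writes the converted result to the canonical "time" column. The SparkSession `spark` and the column names are illustrative assumptions, not part of this diff.

    // Sketch only (assumes a SparkSession named `spark`).
    import java.util.concurrent.TimeUnit
    import org.apache.spark.sql.functions.{ col, udf }

    val df = spark.createDataFrame(Seq(
      (1483228800000L, 7, 0.5) // 2017-01-01 00:00:00 UTC, epoch milliseconds
    )).toDF("date", "id", "price")

    // Same shape as the patched convertDfTimestamps: read from "date",
    // write nanoseconds into the canonical "time" column.
    val converter: Long => Long = TimeUnit.NANOSECONDS.convert(_, TimeUnit.MILLISECONDS)
    val udfConverter = udf(converter)
    val converted = df.withColumn("time", udfConverter(col("date")))
    // converted now carries time = 1483228800000000000L alongside "date"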
@@ -246,7 +247,7 @@ object TimeSeriesRDD {
     val df = dataFrame.withColumnRenamed(timeColumn, timeColumnName)
     requireSchema(df.schema)
 
-    val convertedDf = convertDfTimestamps(dataFrame, timeUnit)
+    val convertedDf = convertDfTimestamps(dataFrame, timeUnit, timeColumn)
     // we want to keep time column first, but no code should rely on that
     val timeFirstDf = if (convertedDf.schema.fieldIndex(timeColumnName) == 0) {
       convertedDf
@@ -270,7 +271,7 @@
   ): TimeSeriesRDD = {
     val df = dataFrame.withColumnRenamed(timeColumn, timeColumnName)
     requireSchema(df.schema)
-    val convertedDf = convertDfTimestamps(df, timeUnit)
+    val convertedDf = convertDfTimestamps(df, timeUnit, timeColumn)
 
     val partitionInfo = PartitionInfo(rangeSplits, deps)
     TimeSeriesRDD.fromSortedDfWithPartInfo(convertedDf, Some(partitionInfo))
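Taken together, the call-site changes let a caller hand in a frame whose timestamps live in a column with any name and still get a correct nanosecond conversion. A hedged usage sketch follows; the exact parameter list of `fromDF` is assumed from the surrounding code, and `df` stands for a DataFrame with epoch-millisecond longs in a "date" column:

    // Sketch only: fromDF already accepted a timeColumn argument; this change
    // threads it through so the conversion no longer assumes a "time" column.
    import java.util.concurrent.TimeUnit.MILLISECONDS

    val tsRdd = TimeSeriesRDD.fromDF(df)(
      isSorted = true,
      timeUnit = MILLISECONDS,
      timeColumn = "date"
    )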
13 changes: 13 additions & 0 deletions src/test/resources/timeseries/csv/PriceWithHeaderDateColumn.csv
@@ -0,0 +1,13 @@
+date,id,price,info
+2017/01/01,7,0.5,test
+2017/01/01,3,1,test
+2017/01/02,3,1.5,test
+2017/01/02,7,2,test
+2017/01/03,3,2.5,test
+2017/01/03,7,3,test
+2017/01/04,3,3.5,test
+2017/01/04,7,4,test
+2017/01/05,3,4.5,test
+2017/01/05,7,5,test
+2017/01/08,3,5.5,test
+2017/01/08,7,6,test
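The expected "time" values for this fixture follow from a short conversion chain, sketched here; it mirrors the assertion in the new test below (time-zone handling is whatever the suite configures):

    // "2017/01/01" parses to epoch milliseconds, which Flint widens to the
    // nanoseconds stored in the "time" column.
    import java.util.concurrent.TimeUnit

    val fmt = new java.text.SimpleDateFormat("yyyy/MM/dd")
    val millis = fmt.parse("2017/01/01").getTime      // epoch ms
    val nanos = TimeUnit.MILLISECONDS.toNanos(millis) // value asserted on below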
17 changes: 16 additions & 1 deletion src/test/scala/com/twosigma/flint/timeseries/CSVSpec.scala
@@ -16,11 +16,12 @@
 
 package com.twosigma.flint.timeseries
 
+import java.time.LocalDate
 import java.util.TimeZone
 import java.util.concurrent.TimeUnit
+
 import com.twosigma.flint.timeseries.row.Schema
 import org.scalatest.FlatSpec
-
 import org.apache.spark.sql.types._
 import com.twosigma.flint.SharedSparkContext
 import com.twosigma.flint.SpecUtils
@@ -117,4 +118,18 @@ class CSVSpec extends FlatSpec with SharedSparkContext {
       assert(first.getAs[Long]("time") == format.parse("2008-01-02 00:00:00.000").getTime * 1000000L)
     }
   }
+
+  it should "correctly read CSV with header, time column not called 'time' and in a specific format" in {
+    SpecUtils.withResource("/timeseries/csv/PriceWithHeaderDateColumn.csv") { source =>
+      val timeseriesRdd = CSV.from(sqlContext, "file://" + source, sorted = false, timeColumnName = "date",
+        dateFormat = "yyyy/MM/dd", header = true)
+      val first = timeseriesRdd.first()
+
+      val format = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S")
+      assert(first.getAs[Long]("time") == TimeUnit.MILLISECONDS.toNanos(
+        format.parse("2017-01-01 00:00:00.000").getTime
+      ))
+    }
+
+  }
 }