diff --git a/swagger.yml b/swagger.yml index bdfd541..f5aafff 100644 --- a/swagger.yml +++ b/swagger.yml @@ -103,6 +103,10 @@ definitions: - vie - chi-sim - chi-tra - - - + output_format: + type: string + description: The format type of output. If omitted, will use simple text + enum: + - tsv + - pdf + - hocr diff --git a/tesseract_engine.go b/tesseract_engine.go index 941f63f..d9629b0 100644 --- a/tesseract_engine.go +++ b/tesseract_engine.go @@ -15,9 +15,10 @@ type TesseractEngine struct { } type TesseractEngineArgs struct { - configVars map[string]string `json:"config_vars"` - pageSegMode string `json:"psm"` - lang string `json:"lang"` + configVars map[string]string `json:"config_vars"` + pageSegMode string `json:"psm"` + lang string `json:"lang"` + outputFormat string `json:"output_format"` } func NewTesseractEngineArgs(ocrRequest OcrRequest) (*TesseractEngineArgs, error) { @@ -70,6 +71,16 @@ func NewTesseractEngineArgs(ocrRequest OcrRequest) (*TesseractEngineArgs, error) engineArgs.lang = langStr } + // output format + outputFormat := ocrRequest.EngineArgs["output_format"] + if outputFormat != nil { + outputFormatStr, ok := outputFormat.(string) + if !ok { + return nil, fmt.Errorf("Could not convert output_format into string: %v", outputFormat) + } + engineArgs.outputFormat = outputFormatStr + } + return engineArgs, nil } @@ -91,6 +102,9 @@ func (t TesseractEngineArgs) Export() []string { result = append(result, "-l") result = append(result, t.lang) } + if t.outputFormat != "" { + result = append(result, t.outputFormat) + } return result } @@ -198,7 +212,7 @@ func (t TesseractEngine) processImageFile(inputFilename string, engineArgs Tesse tmpOutFileBaseName := inputFilename // possible file extensions - fileExtensions := []string{"txt", "hocr"} + fileExtensions := []string{"txt", "hocr", "tsv"} // build args array cflags := engineArgs.Export()