diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..e310bcd --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,50 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. +# This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake +# For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby + +name: Ruby + +on: + push: + branches: [ "*" ] + pull_request: + branches: [ "*" ] + +permissions: + contents: read + +jobs: + lint: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: "3.1" + bundler-cache: true # runs 'bundle install' and caches installed gems automatically + - name: Run lint + run: bundle exec rubocop + + test: + + runs-on: ubuntu-latest + strategy: + matrix: + ruby-version: ['2.6', '2.7', '3.0', '3.1', '3.2'] + needs: + - lint + + steps: + - uses: actions/checkout@v3 + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: ${{ matrix.ruby-version }} + bundler-cache: true # runs 'bundle install' and caches installed gems automatically + - name: Run tests + run: bundle exec rake diff --git a/.gitignore b/.gitignore index d87d4be..4d78914 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,8 @@ spec/reports test/tmp test/version_tmp tmp + +# Mac finder artifacts +.DS_Store + +.idea diff --git a/.rubocop.yml b/.rubocop.yml new file mode 100644 index 0000000..2fce36d --- /dev/null +++ b/.rubocop.yml @@ -0,0 +1,27 @@ +Gemspec/RequiredRubyVersion: + Enabled: false + +Layout/LineLength: + Enabled: false +Metrics: + Enabled: false +Naming/ConstantName: + Enabled: false + +Style/FrozenStringLiteralComment: + Enabled: false +Style/Documentation: + Enabled: false +Style/AndOr: + 
Enabled: false +Style/StringConcatenation: + Enabled: false +Style/ClassAndModuleChildren: + Enabled: false +Style/OptionalBooleanParameter: + Enabled: false +Style/TernaryParentheses: + EnforcedStyle: require_parentheses_when_complex + +Naming/PredicateName: + Enabled: false diff --git a/LICENSE.txt b/LICENSE.txt index 23d448f..6406998 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2013 TODO: Write your name +Copyright (c) 2017 Ramtin Vaziri https://www.ramtin-vaziri.com MIT License @@ -20,3 +20,5 @@ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +https://ramtin-vaziri.com diff --git a/README.md b/README.md new file mode 100644 index 0000000..28e43c7 --- /dev/null +++ b/README.md @@ -0,0 +1,142 @@ +[![version](https://badge.fury.io/rb/creek.svg)](https://badge.fury.io/rb/creek) +[![downloads](https://img.shields.io/gem/dt/creek)](https://rubygems.org/gems/creek) + +# Creek - Stream parser for large Excel (xlsx and xlsm) files. + +Creek is a Ruby gem that provides a fast, simple and efficient method of parsing large Excel (xlsx and xlsm) files. + + +## Installation + +Creek can be used from the command line or as part of a Ruby web framework. 
To install the gem using terminal, run the following command: + +``` +gem install creek +``` + +To use it in Rails, add this line to your Gemfile: + +```ruby +gem 'creek' +``` + +## Basic Usage +Creek can simply parse an Excel file by looping through the rows enumerator: + +```ruby +require 'creek' +creek = Creek::Book.new 'spec/fixtures/sample.xlsx' +sheet = creek.sheets[0] + +sheet.rows.each do |row| + puts row # => {"A1"=>"Content 1", "B1"=>nil, "C1"=>nil, "D1"=>"Content 3"} +end + +sheet.simple_rows.each do |row| + puts row # => {"A"=>"Content 1", "B"=>nil, "C"=>nil, "D"=>"Content 3"} +end + +sheet.rows_with_meta_data.each do |row| + puts row # => {"collapsed"=>"false", "customFormat"=>"false", "customHeight"=>"true", "hidden"=>"false", "ht"=>"12.1", "outlineLevel"=>"0", "r"=>"1", "cells"=>{"A1"=>"Content 1", "B1"=>nil, "C1"=>nil, "D1"=>"Content 3"}} +end + +sheet.simple_rows_with_meta_data.each do |row| + puts row # => {"collapsed"=>"false", "customFormat"=>"false", "customHeight"=>"true", "hidden"=>"false", "ht"=>"12.1", "outlineLevel"=>"0", "r"=>"1", "cells"=>{"A"=>"Content 1", "B"=>nil, "C"=>nil, "D"=>"Content 3"}} +end + +sheet.state # => 'visible' +sheet.name # => 'Sheet1' +sheet.rid # => 'rId2' +``` + +## Filename considerations +By default, Creek will ensure that the file extension is either *.xlsx or *.xlsm, but this check can be circumvented as needed: + +```ruby +path = 'sample-as-zip.zip' +Creek::Book.new path, :check_file_extension => false +``` + +By default, the Rails [file_field_tag](http://api.rubyonrails.org/classes/ActionView/Helpers/FormTagHelper.html#method-i-file_field_tag) uploads to a temporary location and stores the original filename with the StringIO object. (See [this section](http://guides.rubyonrails.org/form_helpers.html#uploading-files) of the Rails Guides for more information.) 
+ +Creek can parse this directly without the need for file upload gems such as Carrierwave or Paperclip by passing the original filename as an option: + +```ruby +# Import endpoint in Rails controller +def import + file = params[:file] + Creek::Book.new file.path, check_file_extension: false +end +``` + +## Parsing images +Creek does not parse images by default. If you want to parse the images, +call the `with_images` method before iterating over rows to preload image information. If you don't call this method, Creek will not return images anywhere. + +The value of a cell that contains images will be an array of Pathname objects. +If an image is spread across multiple cells, the same Pathname object will be returned for each cell. + +```ruby +sheet.with_images.rows.each do |row| + puts row # => {"A1"=>[#<Pathname:/tmp/creek__drawing20161101-53599-274q0v/image1.jpeg>], "B2"=>"Fluffy"} +end +``` + +Images for a specific cell can be obtained with the `images_at` method: + +```ruby +puts sheet.images_at('A1') # => [#<Pathname:/tmp/creek__drawing20161101-53599-274q0v/image1.jpeg>] + +# no images in a cell +puts sheet.images_at('C1') # => nil +``` + +Creek will most likely return nil for a cell with images if there is no other text cell in that row - use the *images_at* method to retrieve the images in that cell. + +## Remote files + +```ruby +remote_url = 'http://dev-builds.libreoffice.org/tmp/test.xlsx' +Creek::Book.new remote_url, remote: true +``` + +## Mapping cells with header names +By default, Creek keys each cell by its column letter and row number (A1, B3, etc.). To look up cell values by header name instead, pass ***with_headers*** when creating the book (this only works with the ***#simple_rows*** method). *(Note: the headers are taken from the first row of the sheet.)* + +```ruby +creek = Creek::Book.new file.path, with_headers: true +``` + + +## Contributing + +Contributions are welcome. You can fork the repository, add your code changes to the forked branch, ensure all existing unit tests pass, create new unit tests which cover your changes, and finally create a pull request.
+ +After forking and then cloning the repository locally, install Bundler and then use it +to install the development gem dependencies: + +``` +gem install bundler +bundle install +``` + +Once this is complete, you should be able to run the test suite: + +``` +rake +``` + +There are some remote tests that are excluded by default. To run those, run: + +``` +bundle exec rspec --tag remote +``` + +## Bug Reporting + +Please use the [Issues](https://github.com/pythonicrubyist/creek/issues) page to report bugs or suggest enhancements. + + +## License + +Creek is published under the [MIT License](https://github.com/pythonicrubyist/creek/blob/master/LICENSE.txt). diff --git a/README.rdoc b/README.rdoc deleted file mode 100644 index cbdedd8..0000000 --- a/README.rdoc +++ /dev/null @@ -1,76 +0,0 @@ -= Creek -- Stream parser for large Excel(xlsx and xlsm) files. - -Creek is a Ruby gem that provide a fast, simple and efficient method of parsing large Excel(xlsx and xlsm) files. - - -== Installation - -Creek can be used from the command line or as part of a Ruby web framework.
To install the gem using terminal, run the following command: - - gem install creek - -To use it in Rails, add this line to your Gemfile: - - gem "creek" - - -== Basic Usage -Creek can simply parse an Excel file by looping through the rows enumerator: - - require 'creek' - creek = Creek::Book.new "specs/fixtures/sample.xlsx" - sheet= creek.sheets[0] - - sheet.rows.each do |row| - puts row # => {"A1"=>"Content 1", "B1"=>nil, C1"=>nil, "D1"=>"Content 3"} - end - - - sheet.rows_with_meta_data.each do |row| - puts row # => {"collapsed"=>"false", "customFormat"=>"false", "customHeight"=>"true", "hidden"=>"false", "ht"=>"12.1", "outlineLevel"=>"0", "r"=>"1", "cells"=>{"A1"=>"Content 1", "B1"=>nil, C1"=>nil, "D1"=>"Content 3"}} - end - - - sheet.state # => 'visible' - sheet.name # => 'Sheet1' - sheet.rid # => 'rId2' - -== Filename considerations -By default, Creek will ensure that the file extension is either *.xlsx or *.xlsm, but this check can be circumvented as needed: - - path = 'sample-as-zip.zip' - Creek::Book.new path, :check_file_extension => false - -By default, the Rails {file_field_tag}[http://api.rubyonrails.org/classes/ActionView/Helpers/FormTagHelper.html#method-i-file_field_tag] uploads to a temporary location and stores the original filename with the StringIO object. (See {this section}[http://guides.rubyonrails.org/form_helpers.html#uploading-files] of the Rails Guides for more information.) - -Creek can parse this directly without the need for file upload gems such as Carrierwave or Paperclip by passing the original filename as an option: - - # Import endpoint in Rails controller - def import - file = params[:file] - Creek::Book.new file.path, check_file_extension: false - end - -== Contributing - -Contributions are welcomed. You can fork a repository, add your code changes to the forked branch, ensure all existing unit tests pass, create new unit tests cover your new changes and finally create a pull request. 
- -After forking and then cloning the repository locally, install Bundler and then use it -to install the development gem dependecies: - - gem install bundler - bundle install - -Once this is complete, you should be able to run the test suite: - - rake - - -== Bug Reporting - -Please use the {Issues}[https://github.com/pythonicrubyist/creek/issues] page to report bugs or suggest new enhancements. - - -== License - -Creek has been published under {MIT License}[https://github.com/pythonicrubyist/creek/blob/master/LICENSE.txt] diff --git a/Rakefile b/Rakefile index b8fe45b..6f6f4e6 100644 --- a/Rakefile +++ b/Rakefile @@ -1,7 +1,7 @@ -require "bundler/gem_tasks" +require 'bundler/gem_tasks' require 'rspec/core/rake_task' RSpec::Core::RakeTask.new('spec') # If you want to make this the default task -task :default => :spec \ No newline at end of file +task default: :spec diff --git a/creek.gemspec b/creek.gemspec index 1b3c6c8..3fb2ca9 100644 --- a/creek.gemspec +++ b/creek.gemspec @@ -1,30 +1,30 @@ -# coding: utf-8 -lib = File.expand_path('../lib', __FILE__) +lib = File.expand_path('lib', __dir__) $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) require 'creek/version' Gem::Specification.new do |spec| - spec.name = "creek" + spec.name = 'creek' spec.version = Creek::VERSION - spec.authors = ["pythonicrubyist"] - spec.email = ["pythonicrubyist@gmail.com"] - spec.description = %q{A Ruby gem that streams and parses large Excel(xlsx and xlsm) files fast and efficiently.} - spec.summary = %q{A Ruby gem for parsing large Excel(xlsx and xlsm) files.} - spec.homepage = "https://github.com/pythonicrubyist/creek" - spec.license = "MIT" + spec.authors = ['pythonicrubyist'] + spec.email = ['pythonicrubyist@gmail.com'] + spec.description = 'A Ruby gem that streams and parses large Excel(xlsx and xlsm) files fast and efficiently.' + spec.summary = 'A Ruby gem for parsing large Excel(xlsx and xlsm) files.' 
+ spec.homepage = 'https://github.com/pythonicrubyist/creek' + spec.license = 'MIT' - spec.files = `git ls-files`.split($/) + spec.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR) spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) - spec.require_paths = ["lib"] + spec.require_paths = ['lib'] - spec.required_ruby_version = '>= 1.9.2' + spec.required_ruby_version = '>= 2.0.0' - spec.add_development_dependency "bundler", "~> 1.3" - spec.add_development_dependency "rake" - spec.add_development_dependency 'rspec', '~> 2.13.0' - spec.add_development_dependency 'pry' + spec.add_development_dependency 'bundler' + spec.add_development_dependency 'pry-byebug' + spec.add_development_dependency 'rake' + spec.add_development_dependency 'rspec', '~> 3.6.0' + spec.add_development_dependency 'rubocop' - spec.add_dependency 'nokogiri', '~> 1.6.0' + spec.add_dependency 'nokogiri', '>= 1.10.0' spec.add_dependency 'rubyzip', '>= 1.0.0' end diff --git a/lib/creek.rb b/lib/creek.rb index 490106c..318d1f6 100644 --- a/lib/creek.rb +++ b/lib/creek.rb @@ -1,9 +1,13 @@ -require "creek/version" +# frozen_string_literal: true + +require 'creek/version' require 'creek/book' require 'creek/styles/constants' require 'creek/styles/style_types' require 'creek/styles/converter' +require 'creek/utils' require 'creek/styles' +require 'creek/drawing' require 'creek/sheet' require 'creek/shared_strings' diff --git a/lib/creek/book.rb b/lib/creek/book.rb index 1501689..157884e 100644 --- a/lib/creek/book.rb +++ b/lib/creek/book.rb @@ -1,32 +1,58 @@ +# frozen_string_literal: true + require 'zip/filesystem' require 'nokogiri' +require 'date' +require 'open-uri' module Creek - class Creek::Book - attr_reader :files, - :sheets, - :shared_strings + :shared_strings, + :with_headers + + DATE_1900 = Date.new(1899, 12, 30).freeze + DATE_1904 = Date.new(1904, 1, 1).freeze - def initialize path, options = {} + def 
initialize(path, options = {}) check_file_extension = options.fetch(:check_file_extension, true) if check_file_extension extension = File.extname(options[:original_filename] || path).downcase - raise 'Not a valid file format.' unless (['.xlsx', '.xlsm'].include? extension) + raise 'Not a valid file format.' unless ['.xlsx', '.xlsm'].include? extension end - @files = Zip::File.open path + path = download_file(path) if options[:remote] + @files = Zip::File.open(path) @shared_strings = SharedStrings.new(self) + @with_headers = options.fetch(:with_headers, false) end def sheets - doc = @files.file.open "xl/workbook.xml" - xml = Nokogiri::XML::Document.parse doc - rels_doc = @files.file.open "xl/_rels/workbook.xml.rels" - rels = Nokogiri::XML::Document.parse(rels_doc).css("Relationship") - @sheets = xml.css('sheet').map do |sheet| - sheetfile = rels.find { |el| sheet.attr("r:id") == el.attr("Id") }.attr("Target") - Sheet.new(self, sheet.attr("name"), sheet.attr("sheetid"), sheet.attr("state"), sheet.attr("visible"), sheet.attr("r:id"), sheetfile) + @sheets ||= begin + doc = @files.file.open 'xl/workbook.xml' + xml = Nokogiri::XML::Document.parse doc + namespaces = xml.namespaces + + css_prefix = '' + namespaces.each do |namespace| + css_prefix = namespace[0].split(':')[1] + '|' if namespace[1] == 'http://schemas.openxmlformats.org/spreadsheetml/2006/main' && namespace[0] != 'xmlns' + end + + rels_doc = @files.file.open 'xl/_rels/workbook.xml.rels' + rels = Nokogiri::XML::Document.parse(rels_doc).css('Relationship') + xml.css(css_prefix + 'sheet').map do |sheet| + sheetfile = rels.find { |el| sheet.attr('r:id') == el.attr('Id') }.attr('Target') + sheet = Sheet.new( + self, + sheet.attr('name'), + sheet.attr('sheetid'), + sheet.attr('state'), + sheet.attr('visible'), + sheet.attr('r:id'), + sheetfile + ) + sheet.with_headers = with_headers + sheet + end end end @@ -37,5 +63,41 @@ def style_types def close @files.close end + + def base_date + @base_date ||= + begin + # 
Default to 1900 (minus one day due to excel quirk) but use 1904 if + # it's set in the Workbook's workbookPr + # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx + result = DATE_1900 # default + + doc = @files.file.open 'xl/workbook.xml' + xml = Nokogiri::XML::Document.parse doc + xml.css('workbookPr[date1904]').each do |workbook_pr| + if workbook_pr['date1904'] =~ /true|1/i + result = DATE_1904 + break + end + end + + result + end + end + + private + + def download_file(url) + # OpenUri will return a StringIO if under OpenURI::Buffer::StringMax + # threshold, and a Tempfile if over. + downloaded = URI(url).open + if downloaded.is_a? StringIO + path = Tempfile.new(['creek-file', '.xlsx']).path + File.binwrite(path, downloaded.read) + path + else + downloaded.path + end + end end end diff --git a/lib/creek/drawing.rb b/lib/creek/drawing.rb new file mode 100644 index 0000000..4df24ce --- /dev/null +++ b/lib/creek/drawing.rb @@ -0,0 +1,116 @@ +# frozen_string_literal: true + +require 'pathname' + +module Creek + class Creek::Drawing + include Creek::Utils + + COLUMNS = ('A'..'AZ').to_a + + def initialize(book, drawing_filepath) + @book = book + @drawing_filepath = drawing_filepath + @drawings = [] + @drawings_rels = [] + @images_pathnames = Hash.new { |hash, key| hash[key] = [] } + + return unless file_exist?(@drawing_filepath) + + load_drawings_and_rels + load_images_pathnames_by_cells if has_images? + end + + ## + # Returns false if there are no images in the drawing file or the drawing file does not exist, true otherwise. + def has_images? + @has_images ||= !@drawings.empty? + end + + ## + # Extracts images from excel to tmpdir for a cell, if the images are not already extracted (multiple calls or same image file in multiple cells). + # Returns array of images as Pathname objects or nil. 
+ def images_at(cell_name) + coordinate = calc_coordinate(cell_name) + pathnames_at_coordinate = @images_pathnames[coordinate] + return if pathnames_at_coordinate.empty? + + pathnames_at_coordinate.map do |image_pathname| + unless image_pathname.exist? + excel_image_path = "xl/media#{image_pathname.to_path.split(tmpdir).last}" + IO.copy_stream(@book.files.file.open(excel_image_path), image_pathname.to_path) + end + image_pathname + end + end + + private + + ## + # Transforms cell name to [row, col], e.g. A1 => [0, 0], B3 => [1, 2] + # Rows and cols start with 0. + def calc_coordinate(cell_name) + col = COLUMNS.index(cell_name.slice(/[A-Z]+/)) + row = cell_name.slice(/\d+/).to_i - 1 # rows in drawings start with 0 + [row, col] + end + + ## + # Creates/loads temporary directory for extracting images from excel + def tmpdir + @tmpdir ||= ::Dir.mktmpdir('creek__drawing') + end + + ## + # Parses drawing and drawing's relationships xmls. + # Drawing xml contains relationships ID's and coordinates (row, col). + # Drawing relationships xml contains images' locations. + def load_drawings_and_rels + @drawings = parse_xml(@drawing_filepath).css('xdr|twoCellAnchor', 'xdr|oneCellAnchor') + drawing_rels_filepath = expand_to_rels_path(@drawing_filepath) + @drawings_rels = parse_xml(drawing_rels_filepath).css('Relationships') + end + + ## + # Iterates through the drawings and saves images' paths as Pathname objects to a hash with [row, col] keys. + # As multiple images can be located in a single cell, hash values are array of Pathname objects. + # One image can be spread across multiple cells (defined with from-row/to-row/from-col/to-col attributes) - same Pathname object is associated to each row-col combination for the range. 
+ def load_images_pathnames_by_cells + image_selector = 'xdr:pic/xdr:blipFill/a:blip' + row_from_selector = 'xdr:from/xdr:row' + row_to_selector = 'xdr:to/xdr:row' + col_from_selector = 'xdr:from/xdr:col' + col_to_selector = 'xdr:to/xdr:col' + + @drawings.xpath('//xdr:twoCellAnchor', '//xdr:oneCellAnchor').each do |drawing| + # embed = drawing.xpath(image_selector).first.attributes['embed'] + temp = drawing.xpath(image_selector).first + embed = temp.attributes['embed'] if temp + next if embed.nil? + + rid = embed.value + path = Pathname.new("#{tmpdir}/#{extract_drawing_path(rid).slice(%r{[^/]*$})}") + + row_from = drawing.xpath(row_from_selector).text.to_i + col_from = drawing.xpath(col_from_selector).text.to_i + + if drawing.name == 'oneCellAnchor' + @images_pathnames[[row_from, col_from]].push(path) + else + row_to = drawing.xpath(row_to_selector).text.to_i + col_to = drawing.xpath(col_to_selector).text.to_i + + (col_from..col_to).each do |col| + (row_from..row_to).each do |row| + @images_pathnames[[row, col]].push(path) + end + end + end + end + end + + def extract_drawing_path(rid) + @drawings_rels.css("Relationship[@Id=#{rid}]").first.attributes['Target'].value + end + end +end diff --git a/lib/creek/shared_strings.rb b/lib/creek/shared_strings.rb index e8baf95..2b32e6b 100644 --- a/lib/creek/shared_strings.rb +++ b/lib/creek/shared_strings.rb @@ -1,24 +1,26 @@ +# frozen_string_literal: true + require 'zip/filesystem' require 'nokogiri' module Creek - class Creek::SharedStrings + SPREADSHEETML_URI = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main' attr_reader :book, :dictionary - def initialize book + def initialize(book) @book = book parse_shared_shared_strings end def parse_shared_shared_strings - path = "xl/sharedStrings.xml" - if @book.files.file.exist?(path) - doc = @book.files.file.open path - xml = Nokogiri::XML::Document.parse doc - parse_shared_string_from_document(xml) - end + path = 'xl/sharedStrings.xml' + return unless 
@book.files.file.exist?(path) + + doc = @book.files.file.open path + xml = Nokogiri::XML::Document.parse doc + parse_shared_string_from_document(xml) end def parse_shared_string_from_document(xml) @@ -26,19 +28,26 @@ def parse_shared_string_from_document(xml) end def self.parse_shared_string_from_document(xml) - dictionary = Hash.new - - xml.css('si').each_with_index do |si, idx| - text_nodes = si.css('t') - if text_nodes.count == 1 # plain text node - dictionary[idx] = text_nodes.first.content - else # rich text nodes with text fragments - dictionary[idx] = text_nodes.map(&:content).join('') - end + dictionary = {} + namespace = xml.namespaces.detect { |_key, uri| uri == SPREADSHEETML_URI } + prefix = if namespace && namespace[0].start_with?('xmlns:') + namespace[0].delete_prefix('xmlns:') + '|' + else + '' + end + node_selector = "#{prefix}si" + text_selector = ">#{prefix}t, #{prefix}r #{prefix}t" + + xml.css(node_selector).each_with_index do |si, idx| + text_nodes = si.css(text_selector) + dictionary[idx] = if text_nodes.count == 1 # plain text node + Creek::Styles::Converter.unescape_string(text_nodes.first.content) + else # rich text nodes with text fragments + text_nodes.map { |n| Creek::Styles::Converter.unescape_string(n.content) }.join('') + end end dictionary end - end end diff --git a/lib/creek/sheet.rb b/lib/creek/sheet.rb index 780569a..69d8497 100644 --- a/lib/creek/sheet.rb +++ b/lib/creek/sheet.rb @@ -1,19 +1,26 @@ +# frozen_string_literal: true + require 'zip/filesystem' require 'nokogiri' module Creek class Creek::Sheet + include Creek::Utils + + HEADERS_ROW_NUMBER = '1' + SPREADSHEETML_URI = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main' + attr_accessor :with_headers attr_reader :book, :name, :sheetid, :state, :visible, :rid, - :index - + :index, + :headers - def initialize book, name, sheetid, state, visible, rid, sheetfile + def initialize(book, name, sheetid, state, visible, rid, sheetfile) @book = book @name = name @sheetid = 
sheetid @@ -21,73 +28,125 @@ def initialize book, name, sheetid, state, visible, rid, sheetfile @rid = rid @state = state @sheetfile = sheetfile + @images_present = false + end - # An XLS file has only 256 columns, however, an XLSX or XLSM file can contain up to 16384 columns. - # This function creates a hash with all valid XLSX column names and associated indices. - @excel_col_names = Hash.new - (0...16384).each do |i| - @excel_col_names[col_name(i)] = i + ## + # Preloads images info (coordinates and paths) from related drawing.xml and drawing rels. + # Must be called before #rows method if you want to have images included. + # Returns self so you can chain the calls (sheet.with_images.rows). + def with_images + @drawingfile = extract_drawing_filepath + if @drawingfile + @drawing = Creek::Drawing.new(@book, @drawingfile.sub('..', 'xl')) + @images_present = @drawing.has_images? end + self + end + + ## + # Extracts images for a cell to a temporary folder. + # Returns array of Pathnames for the cell. + # Returns nil if images are not found for the cell or images were not preloaded with #with_images. + def images_at(cell) + @drawing.images_at(cell) if @images_present + end + + ## + # Provides an Enumerator that returns a hash representing each row. + # The key of the hash is the column ID and the value is the value of the cell. + def simple_rows + rows_generator false, true end ## # Provides an Enumerator that returns a hash representing each row. # The key of the hash is the Cell id and the value is the value of the cell. def rows - rows_generator + rows_generator false, false end ## # Provides an Enumerator that returns a hash representing each row. # The hash contains meta data of the row and a 'cells' embended hash which contains the cell contents. def rows_with_meta_data - rows_generator true + rows_generator true, false end - private ## - # Returns valid Excel column name for a given column index. - # For example, returns "A" for 0, "B" for 1 and "AQ" for 42.
- def col_name i - quot = i/26 - (quot>0 ? col_name(quot-1) : "") + (i%26+65).chr + # Provides an Enumerator that returns a hash representing each row. + # The hash contains meta data of the row and a 'cells' embended hash which contains the cell contents. + def simple_rows_with_meta_data + rows_generator true, true end + private + ## # Returns a hash per row that includes the cell ids and values. # Empty cells will be also included in the hash with a nil value. - def rows_generator include_meta_data=false - path = "xl/#{@sheetfile}" - if @book.files.file.exist?(path) - # SAX parsing, Each element in the stream comes through as two events: - # one to open the element and one to close it. - opener = Nokogiri::XML::Reader::TYPE_ELEMENT - closer = Nokogiri::XML::Reader::TYPE_END_ELEMENT - Enumerator.new do |y| - shared, row, cells, cell = false, nil, {}, nil - cell_type = nil - cell_style_idx = nil - @book.files.file.open(path) do |xml| - Nokogiri::XML::Reader.from_io(xml).each do |node| - if (node.name.eql? 'row') and (node.node_type.eql? opener) - row = node.attributes - row['cells'] = Hash.new - cells = Hash.new - y << (include_meta_data ? row : cells) if node.self_closing? - elsif (node.name.eql? 'row') and (node.node_type.eql? closer) - processed_cells = fill_in_empty_cells(cells, row['r'], cell) - row['cells'] = processed_cells - y << (include_meta_data ? row : processed_cells) - elsif (node.name.eql? 'c') and (node.node_type.eql? opener) - cell_type = node.attributes['t'] - cell_style_idx = node.attributes['s'] - cell = node.attributes['r'] - - elsif (node.name.eql? 'v') and (node.node_type.eql? opener) - if !cell.nil? - cells[cell] = convert(node.inner_xml, cell_type, cell_style_idx) + def rows_generator(include_meta_data = false, use_simple_rows_format = false) + path = (@sheetfile.start_with? '/xl/' or @sheetfile.start_with? 'xl/') ? 
@sheetfile : "xl/#{@sheetfile}" + return unless @book.files.file.exist?(path) + + # SAX parsing, Each element in the stream comes through as two events: + # one to open the element and one to close it. + opener = Nokogiri::XML::Reader::TYPE_ELEMENT + closer = Nokogiri::XML::Reader::TYPE_END_ELEMENT + Enumerator.new do |y| + @headers = nil + row = nil + cells = {} + cell = nil + cell_type = nil + cell_style_idx = nil + @book.files.file.open(path) do |xml| + prefix = '' + name_row = 'row' + name_c = 'c' + name_v = 'v' + name_t = 't' + Nokogiri::XML::Reader.from_io(xml).each do |node| + if prefix.empty? && node.namespaces.any? + namespace = node.namespaces.detect { |_key, uri| uri == SPREADSHEETML_URI } + prefix = if namespace && namespace[0].start_with?('xmlns:') + namespace[0].delete_prefix('xmlns:') + ':' + else + '' + end + name_row = "#{prefix}row" + name_c = "#{prefix}c" + name_v = "#{prefix}v" + name_t = "#{prefix}t" + end + if node.name == name_row && node.node_type == opener + row = node.attributes + row['cells'] = {} + cells = {} + y << (include_meta_data ? row : cells) if node.self_closing? + elsif node.name == name_row && node.node_type == closer + processed_cells = fill_in_empty_cells(cells, row['r'], cell, use_simple_rows_format) + @headers = processed_cells if with_headers && row['r'] == HEADERS_ROW_NUMBER + + if @images_present + processed_cells.each do |cell_name, cell_value| + next unless cell_value.nil? + + processed_cells[cell_name] = images_at(cell_name) end end + + row['cells'] = processed_cells + y << (include_meta_data ? row : processed_cells) + elsif node.name == name_c && node.node_type == opener + cell_type = node.attributes['t'] + cell_style_idx = node.attributes['s'] + cell = node.attributes['r'] + elsif (node.name == name_v || node.name == name_t) && node.node_type == opener + unless cell.nil? 
+ node.read + cells[cell] = convert(node.value, cell_type, cell_style_idx) + end end end end @@ -100,29 +159,49 @@ def convert(value, type, style_idx) end def converter_options - @converter_options ||= {shared_strings: @book.shared_strings.dictionary} + @converter_options ||= { + shared_strings: @book.shared_strings.dictionary, + base_date: @book.base_date + } end ## # The unzipped XML file does not contain any node for empty cells. # Empty cells are being padded in using this function - def fill_in_empty_cells cells, row_number, last_col - new_cells = Hash.new - unless cells.empty? - keys = cells.keys.sort - last_col = last_col.gsub(row_number, '') - last_col_index = @excel_col_names[last_col] - [*(0..last_col_index)].each do |i| - col = col_name i - id = "#{col}#{row_number}" - unless cells.has_key? id - new_cells[id] = nil - else - new_cells[id] = cells[id] - end - end + def fill_in_empty_cells(cells, row_number, last_col, use_simple_rows_format) + new_cells = {} + return new_cells if cells.empty? + + last_col = last_col.gsub(row_number, '') + ('A'..last_col).to_a.each do |column| + id = cell_id(column, use_simple_rows_format, row_number) + new_cells[id] = cells["#{column}#{row_number}"] end + new_cells end + + ## + # Find drawing filepath for the current sheet. + # Sheet xml contains drawing relationship ID. + # Sheet relationships xml contains drawing file's location. + def extract_drawing_filepath + # Read drawing relationship ID from the sheet. + sheet_filepath = "xl/#{@sheetfile}" + drawing = parse_xml(sheet_filepath).css('drawing').first + return if drawing.nil? + + drawing_rid = drawing.attributes['id'].value + + # Read sheet rels to find drawing file's location. 
+ sheet_rels_filepath = expand_to_rels_path(sheet_filepath) + parse_xml(sheet_rels_filepath).css("Relationship[@Id='#{drawing_rid}']").first.attributes['Target'].value + end + + def cell_id(column, use_simple_rows_format, row_number) + return "#{column}#{row_number}" unless use_simple_rows_format + + (with_headers && headers) ? headers[column] : column + end end end diff --git a/lib/creek/styles.rb b/lib/creek/styles.rb index d4681e9..d598c17 100644 --- a/lib/creek/styles.rb +++ b/lib/creek/styles.rb @@ -1,27 +1,26 @@ +# frozen_string_literal: true + module Creek class Styles attr_accessor :book + def initialize(book) @book = book end def path - "xl/styles.xml" + 'xl/styles.xml' end def styles_xml - @styles_xml ||= begin - if @book.files.file.exist?(path) - doc = @book.files.file.open path - Nokogiri::XML::Document.parse doc - end - end + @styles_xml ||= if @book.files.file.exist?(path) + doc = @book.files.file.open path + Nokogiri::XML::Document.parse doc + end end def style_types - @style_types ||= begin - Creek::Styles::StyleTypes.new(styles_xml).call - end + @style_types ||= Creek::Styles::StyleTypes.new(styles_xml).call end end end diff --git a/lib/creek/styles/constants.rb b/lib/creek/styles/constants.rb index 849c0ba..163d9e0 100644 --- a/lib/creek/styles/constants.rb +++ b/lib/creek/styles/constants.rb @@ -1,20 +1,18 @@ -require 'date' - module Creek class Styles module Constants # Map of non-custom numFmtId to casting symbol NumFmtMap = { - 0 => :string, # General - 1 => :fixnum, # 0 - 2 => :float, # 0.00 - 3 => :fixnum, # #,##0 - 4 => :float, # #,##0.00 - 5 => :unsupported, # $#,##0_);($#,##0) - 6 => :unsupported, # $#,##0_);[Red]($#,##0) - 7 => :unsupported, # $#,##0.00_);($#,##0.00) - 8 => :unsupported, # $#,##0.00_);[Red]($#,##0.00) - 9 => :percentage, # 0% + 0 => :string, # General + 1 => :fixnum, # 0 + 2 => :float, # 0.00 + 3 => :fixnum, # #,##0 + 4 => :float, # #,##0.00 + 5 => :unsupported, # $#,##0_);($#,##0) + 6 => :unsupported, # 
$#,##0_);[Red]($#,##0) + 7 => :unsupported, # $#,##0.00_);($#,##0.00) + 8 => :unsupported, # $#,##0.00_);[Red]($#,##0.00) + 9 => :percentage, # 0% 10 => :percentage, # 0.00% 11 => :bignum, # 0.00E+00 12 => :unsupported, # # ?/? @@ -37,10 +35,7 @@ module Constants 47 => :time, # mmss.0 48 => :bignum, # ##0.0E+0 49 => :unsupported # @ - } - - DATE_SYSTEM_1900 = Date.new(1899, 12, 30) - DATE_SYSTEM_1904 = Date.new(1904, 1, 1) + }.freeze end end end diff --git a/lib/creek/styles/converter.rb b/lib/creek/styles/converter.rb index 59f6714..08d7047 100644 --- a/lib/creek/styles/converter.rb +++ b/lib/creek/styles/converter.rb @@ -1,9 +1,15 @@ +# frozen_string_literal: true + require 'set' module Creek class Styles class Converter include Creek::Styles::Constants + + # Excel non-printable character escape sequence + HEX_ESCAPE_REGEXP = /_x[0-9A-Fa-f]{4}_/.freeze + ## # The heart of typecasting. The ruby type is determined either explicitly # from the cell xml or implicitly from the cell style, and this @@ -23,14 +29,12 @@ class Converter # - shared_strings: needed for 's' (shared string) type # - base_date: from what date to begin, see method #base_date - DATE_TYPES = [:date, :time, :date_time].to_set + DATE_TYPES = %i[date time date_time].to_set def self.call(value, type, style, options = {}) return nil if value.nil? || value.empty? # Sometimes the type is dictated by the style alone - if type.nil? || (type == 'n' && DATE_TYPES.include?(style)) - type = style - end + type = style if type.nil? 
|| (type == 'n' && DATE_TYPES.include?(style)) case type @@ -45,74 +49,86 @@ def self.call(value, type, style, options = {}) when 'b' value.to_i == 1 when 'str' - value + unescape_string(value) when 'inlineStr' - value + unescape_string(value) ## # Type can also be determined by a style, # detected earlier and cast here by its standardized symbol ## - when :string, :unsupported + when :string value + when :unsupported + convert_unknown(value) when :fixnum value.to_i - when :float + when :float, :percentage value.to_f - when :percentage - value.to_f / 100 - when :date, :time, :date_time + when :date convert_date(value, options) + when :time, :date_time + convert_datetime(value, options) when :bignum convert_bignum(value) ## Nothing matched + else + convert_unknown(value) + end + end + + def self.convert_unknown(value) + if value.nil? or value.empty? + value + elsif value.to_i.to_s == value.to_s + value.to_i + elsif value.to_f.to_s == value.to_s + value.to_f else value end + rescue StandardError + value end - # the trickiest. note that all these formats can vary on - # whether they actually contain a date, time, or datetime. 
def self.convert_date(value, options) - value = value.to_f - days_since_date_system_start = value.to_i - fraction_of_24 = value - days_since_date_system_start + date = base_date(options) + value.to_i + yyyy, mm, dd = date.strftime('%Y-%m-%d').split('-') + + ::Date.new(yyyy.to_i, mm.to_i, dd.to_i) + end - # http://stackoverflow.com/questions/10559767/how-to-convert-ms-excel-date-from-float-to-date-format-in-ruby - date = options.fetch(:base_date, DATE_SYSTEM_1900) + days_since_date_system_start + def self.convert_datetime(value, options) + date = base_date(options) + value.to_f.round(6) - if fraction_of_24 > 0 # there is a time associated - seconds = (fraction_of_24 * 86400).round - return Time.utc(date.year, date.month, date.day) + seconds - else - return date - end + round_datetime(date.strftime('%Y-%m-%d %H:%M:%S.%N')) end def self.convert_bignum(value) if defined?(BigDecimal) - BigDecimal.new(value) + BigDecimal(value) else value.to_f end end - ## Returns the base_date from which to calculate dates. - # Defaults to 1900 (minus two days due to excel quirk), but use 1904 if - # it's set in the Workbook's workbookPr. - # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx - def base_date - @base_date ||= begin - # return DATE_SYSTEM_1900 if xml.workbook == nil - # xml.workbook.xpath("//workbook/workbookPr[@date1904]").each do |workbookPr| - # return DATE_SYSTEM_1904 if workbookPr["date1904"] =~ /true|1/i - # end - DATE_SYSTEM_1900 - end + def self.unescape_string(value) + # excel encodes some non-printable characters using a hex code in the format _xHHHH_ + # e.g. Carriage Return (\r) is encoded as _x000D_ + value.gsub(HEX_ESCAPE_REGEXP) { |match| match[2, 4].to_i(16).chr(Encoding::UTF_8) } + end + + def self.base_date(options) + options.fetch(:base_date, Date.new(1899, 12, 30)) end + def self.round_datetime(datetime_string) + /(?<yyyy>\d+)-(?<mm>\d+)-(?<dd>
\d+) (?<hh>\d+):(?<mi>\d+):(?<ss>\d+.\d+)/ =~ datetime_string + + ::Time.new(yyyy.to_i, mm.to_i, dd.to_i, hh.to_i, mi.to_i, ss.to_r).round(0) + end end end end diff --git a/lib/creek/styles/style_types.rb b/lib/creek/styles/style_types.rb index de0db03..0f34dc2 100644 --- a/lib/creek/styles/style_types.rb +++ b/lib/creek/styles/style_types.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + # https://github.com/hmcgowan/roo/blob/master/lib/roo/excelx.rb # https://github.com/woahdae/simple_xlsx_reader/blob/master/lib/simple_xlsx_reader.rb#L231 module Creek @@ -5,6 +7,7 @@ class Styles class StyleTypes include Creek::Styles::Constants attr_accessor :styles_xml_doc + def initialize(styles_xml_doc) @styles_xml_doc = styles_xml_doc end @@ -24,17 +27,18 @@ def initialize(styles_xml_doc) # custom). Hence this style types array, rather than a map of numFmtId to # type. def call - @style_types ||= begin - styles_xml_doc.css('styleSheet cellXfs xf').map do |xstyle| - a = num_fmt_id(xstyle) - style_type_by_num_fmt_id( a ) - end + # rubocop:disable Naming/MemoizedInstanceVariableName + @style_types ||= styles_xml_doc.css('styleSheet cellXfs xf').map do |xstyle| + a = num_fmt_id(xstyle) + style_type_by_num_fmt_id(a) end + # rubocop:enable Naming/MemoizedInstanceVariableName end - #returns the numFmtId value if it's available + # returns the numFmtId value if it's available def num_fmt_id(xstyle) return nil unless xstyle.attributes['numFmtId'] + xstyle.attributes['numFmtId'].value end @@ -48,6 +52,7 @@ def num_fmt_id(xstyle) # like a bad idea, but we try to be flexible and just go with it. def style_type_by_num_fmt_id(id) return nil unless id + id = id.to_i NumFmtMap[id] || custom_style_types[id] end @@ -55,13 +60,10 @@ def style_type_by_num_fmt_id(id) # Map of (numFmtId >= 164) (custom styles) to our best guess at the type # ex.
{164 => :date_time} def custom_style_types - @custom_style_types ||= begin - styles_xml_doc.css('styleSheet numFmts numFmt').inject({}) do |acc, xstyle| - index = xstyle.attributes['numFmtId'].value.to_i - value = xstyle.attributes['formatCode'].value - acc[index] = determine_custom_style_type(value) - acc - end + @custom_style_types ||= styles_xml_doc.css('styleSheet numFmts numFmt').each_with_object({}) do |xstyle, acc| + index = xstyle.attributes['numFmtId'].value.to_i + value = xstyle.attributes['formatCode'].value + acc[index] = determine_custom_style_type(value) end end @@ -78,7 +80,7 @@ def determine_custom_style_type(string) # Looks for one of ymdhis outside of meta-stuff like [Red] return :date_time if string =~ /(^|\])[^\[]*[ymdhis]/i - return :unsupported + :unsupported end end end diff --git a/lib/creek/utils.rb b/lib/creek/utils.rb new file mode 100644 index 0000000..5b87163 --- /dev/null +++ b/lib/creek/utils.rb @@ -0,0 +1,18 @@ +# frozen_string_literal: true + +module Creek + module Utils + def expand_to_rels_path(filepath) + filepath.sub(%r{(/[^/]+$)}, '/_rels\1.rels') + end + + def file_exist?(path) + @book.files.file.exist?(path) + end + + def parse_xml(xml_path) + doc = @book.files.file.open(xml_path) + Nokogiri::XML::Document.parse(doc) + end + end +end diff --git a/lib/creek/version.rb b/lib/creek/version.rb index ffaf445..7c70ce4 100644 --- a/lib/creek/version.rb +++ b/lib/creek/version.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + module Creek - VERSION = "1.1.1" + VERSION = '2.6.3' end diff --git a/spec/.DS_Store b/spec/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/spec/.DS_Store differ diff --git a/spec/drawing_spec.rb b/spec/drawing_spec.rb new file mode 100644 index 0000000..2832a5d --- /dev/null +++ b/spec/drawing_spec.rb @@ -0,0 +1,72 @@ +require './spec/spec_helper' + +describe 'drawing' do + let(:book) { Creek::Book.new('spec/fixtures/sample-with-images.xlsx') } + let(:book_no_images) { 
Creek::Book.new('spec/fixtures/sample.xlsx') } + let(:book_with_one_cell_anchored_images) do + Creek::Book.new('spec/fixtures/sample-with-one-cell-anchored-images.xlsx') + end + let(:drawingfile) { 'xl/drawings/drawing1.xml' } + let(:drawing) { Creek::Drawing.new(book, drawingfile) } + let(:drawing_without_images) { Creek::Drawing.new(book_no_images, drawingfile) } + let(:drawing_with_one_cell_anchored_images) { Creek::Drawing.new(book_with_one_cell_anchored_images, drawingfile) } + + describe '#has_images?' do + it 'has' do + expect(drawing.has_images?).to eq(true) + end + + it 'does not have' do + expect(drawing_without_images.has_images?).to eq(false) + end + end + + describe '#images_at' do + it 'returns images pathnames at cell' do + image = drawing.images_at('A2')[0] + expect(image.class).to eq(Pathname) + expect(image.exist?).to eq(true) + expect(image.to_path).to match(/.+creek__drawing.+\.jpeg$/) + end + + context 'when no images in cell' do + it 'returns nil' do + images = drawing.images_at('B2') + expect(images).to eq(nil) + end + end + + context 'when more images in one cell' do + it 'returns all images at cell' do + images = drawing.images_at('A10') + expect(images.size).to eq(2) + expect(images.all?(&:exist?)).to eq(true) + end + end + + context 'when same image across multiple cells' do + it 'returns same image for each cell' do + image1 = drawing.images_at('A4')[0] + image2 = drawing.images_at('A5')[0] + expect(image1.class).to eq(Pathname) + expect(image1).to eq(image2) + end + end + + context 'when one cell anchored images in cell' do + it 'returns image for anchored cell' do + image = drawing_with_one_cell_anchored_images.images_at('A2')[0] + expect(image.class).to eq(Pathname) + expect(image.exist?).to eq(true) + end + + it 'returns nil for non-anchored cell' do + image = drawing_with_one_cell_anchored_images.images_at('A3')[0] + # Image can be seen present on cell A4 in `sample-with-one-cell-anchored-images.xlsx` + image_at_non_anchored_cell = 
drawing_with_one_cell_anchored_images.images_at('A4') + expect(image.class).to eq(Pathname) + expect(image_at_non_anchored_cell).to eq(nil) + end + end + end +end diff --git a/spec/fixtures/escaped.xlsx b/spec/fixtures/escaped.xlsx new file mode 100644 index 0000000..a8ee772 Binary files /dev/null and b/spec/fixtures/escaped.xlsx differ diff --git a/spec/fixtures/escaped2.xlsx b/spec/fixtures/escaped2.xlsx new file mode 100644 index 0000000..7a70942 Binary files /dev/null and b/spec/fixtures/escaped2.xlsx differ diff --git a/spec/fixtures/large_numbers.xlsx b/spec/fixtures/large_numbers.xlsx new file mode 100644 index 0000000..ec39d13 Binary files /dev/null and b/spec/fixtures/large_numbers.xlsx differ diff --git a/spec/fixtures/sample-with-headers.xlsx b/spec/fixtures/sample-with-headers.xlsx new file mode 100644 index 0000000..966bc2d Binary files /dev/null and b/spec/fixtures/sample-with-headers.xlsx differ diff --git a/spec/fixtures/sample-with-headers_namespaced.xlsx b/spec/fixtures/sample-with-headers_namespaced.xlsx new file mode 100644 index 0000000..d86f96a Binary files /dev/null and b/spec/fixtures/sample-with-headers_namespaced.xlsx differ diff --git a/spec/fixtures/sample-with-images.xlsx b/spec/fixtures/sample-with-images.xlsx new file mode 100644 index 0000000..d3706e4 Binary files /dev/null and b/spec/fixtures/sample-with-images.xlsx differ diff --git a/spec/fixtures/sample-with-one-cell-anchored-images.xlsx b/spec/fixtures/sample-with-one-cell-anchored-images.xlsx new file mode 100644 index 0000000..4a2f95d Binary files /dev/null and b/spec/fixtures/sample-with-one-cell-anchored-images.xlsx differ diff --git a/spec/fixtures/sample.xlsx b/spec/fixtures/sample.xlsx index 58f6dd3..8310529 100644 Binary files a/spec/fixtures/sample.xlsx and b/spec/fixtures/sample.xlsx differ diff --git a/spec/fixtures/sample_dates.xlsx b/spec/fixtures/sample_dates.xlsx new file mode 100644 index 0000000..e4edea5 Binary files /dev/null and 
b/spec/fixtures/sample_dates.xlsx differ diff --git a/spec/fixtures/sample_namespaced.xlsx b/spec/fixtures/sample_namespaced.xlsx new file mode 100644 index 0000000..a2f4a40 Binary files /dev/null and b/spec/fixtures/sample_namespaced.xlsx differ diff --git a/spec/fixtures/sheets/sample_dates.xlsx b/spec/fixtures/sheets/sample_dates.xlsx new file mode 100644 index 0000000..0a5d92f Binary files /dev/null and b/spec/fixtures/sheets/sample_dates.xlsx differ diff --git a/spec/fixtures/sheets/single_data_programme.xlsx b/spec/fixtures/sheets/single_data_programme.xlsx new file mode 100644 index 0000000..468537f Binary files /dev/null and b/spec/fixtures/sheets/single_data_programme.xlsx differ diff --git a/spec/fixtures/sst.xml b/spec/fixtures/sst.xml index cd4bd29..2b54051 100644 --- a/spec/fixtures/sst.xml +++ b/spec/fixtures/sst.xml @@ -75,4 +75,17 @@ B2 + + Cell with_x000D_escaped_x000D_characters + + + 吉田兼好 + + ヨシダ + + + ケンコウ + + + \ No newline at end of file diff --git a/spec/fixtures/sst_namespaced.xml b/spec/fixtures/sst_namespaced.xml new file mode 100644 index 0000000..cabf0ed --- /dev/null +++ b/spec/fixtures/sst_namespaced.xml @@ -0,0 +1,91 @@ + + + + Cell A1 + + + Cell B1 + + + My Cell + + + + + + + + + + + Cell + + + + + + + + + + + + + + + + + + + + + A2 + + + + + + + + + + + + Cell + + + + + + + + + + + + + + + + + + + + + B2 + + + + Cell with_x000D_escaped_x000D_characters + + + 吉田兼好 + + ヨシダ + + + ケンコウ + + + + diff --git a/spec/shared_string_spec.rb b/spec/shared_string_spec.rb index b8f5edc..6fb4b88 100644 --- a/spec/shared_string_spec.rb +++ b/spec/shared_string_spec.rb @@ -1,18 +1,35 @@ require './spec/spec_helper' describe 'shared strings' do - it 'parses rich text strings correctly' do shared_strings_xml_file = File.open('spec/fixtures/sst.xml') doc = Nokogiri::XML(shared_strings_xml_file) dictionary = Creek::SharedStrings.parse_shared_string_from_document(doc) - dictionary.keys.size.should == 5 - dictionary[0].should == 'Cell A1' - 
dictionary[1].should == 'Cell B1' - dictionary[2].should == 'My Cell' - dictionary[3].should == 'Cell A2' - dictionary[4].should == 'Cell B2' + expect(dictionary.keys.size).to eq(7) + expect(dictionary[0]).to eq('Cell A1') + expect(dictionary[1]).to eq('Cell B1') + expect(dictionary[2]).to eq('My Cell') + expect(dictionary[3]).to eq('Cell A2') + expect(dictionary[4]).to eq('Cell B2') + expect(dictionary[5]).to eq("Cell with\rescaped\rcharacters") + expect(dictionary[6]).to eq('吉田兼好') end -end \ No newline at end of file + context 'when the nodes are namespaced' do + it 'parses the dictionary correctly' do + shared_strings_xml_file = File.open('spec/fixtures/sst_namespaced.xml') + doc = Nokogiri::XML(shared_strings_xml_file) + dictionary = Creek::SharedStrings.parse_shared_string_from_document(doc) + + expect(dictionary.keys.size).to eq(7) + expect(dictionary[0]).to eq('Cell A1') + expect(dictionary[1]).to eq('Cell B1') + expect(dictionary[2]).to eq('My Cell') + expect(dictionary[3]).to eq('Cell A2') + expect(dictionary[4]).to eq('Cell B2') + expect(dictionary[5]).to eq("Cell with\rescaped\rcharacters") + expect(dictionary[6]).to eq('吉田兼好') + end + end +end diff --git a/spec/sheet_spec.rb b/spec/sheet_spec.rb new file mode 100644 index 0000000..6ecee45 --- /dev/null +++ b/spec/sheet_spec.rb @@ -0,0 +1,194 @@ +# frozen_string_literal: true + +require './spec/spec_helper' + +describe 'sheet' do + let(:book_with_images) { Creek::Book.new('spec/fixtures/sample-with-images.xlsx') } + let(:sheetfile) { 'worksheets/sheet1.xml' } + let(:sheet_with_images) { Creek::Sheet.new(book_with_images, 'Sheet 1', 1, '', '', '1', sheetfile) } + + def load_cell(rows, cell_name) + cell = rows.find { |row| row[cell_name] } + cell[cell_name] if cell + end + + context 'escaped ampersand' do + let(:book_escaped) { Creek::Book.new('spec/fixtures/escaped.xlsx') } + it 'does NOT escape ampersand' do + expect(book_escaped.sheets[0].rows.to_enum.map(&:values)).to eq([%w[abc def], %w[ghi j&k]]) + 
end + + let(:book_escaped2) { Creek::Book.new('spec/fixtures/escaped2.xlsx') } + it 'does escape ampersand' do + expect(book_escaped2.sheets[0].rows.to_enum.map(&:values)).to eq([%w[abc def], %w[ghi j&k]]) + end + end + + describe '#rows' do + context 'with excel with images' do + context 'with images preloading' do + let(:rows) { sheet_with_images.with_images.rows.map { |r| r } } + + it 'parses single image in a cell' do + expect(load_cell(rows, 'A2').size).to eq(1) + end + + it 'returns nil for cells without images' do + expect(load_cell(rows, 'A3')).to eq(nil) + expect(load_cell(rows, 'A7')).to eq(nil) + expect(load_cell(rows, 'A9')).to eq(nil) + end + + it 'returns nil for merged cell within empty row' do + expect(load_cell(rows, 'A5')).to eq(nil) + end + + it 'returns nil for image in a cell with empty row' do + expect(load_cell(rows, 'A8')).to eq(nil) + end + + it 'returns images for merged cells' do + expect(load_cell(rows, 'A4').size).to eq(1) + expect(load_cell(rows, 'A6').size).to eq(1) + end + + it 'returns multiple images' do + expect(load_cell(rows, 'A10').size).to eq(2) + end + end + + it 'ignores images' do + rows = sheet_with_images.rows.map { |r| r } + expect(load_cell(rows, 'A2')).to eq(nil) + expect(load_cell(rows, 'A3')).to eq(nil) + expect(load_cell(rows, 'A4')).to eq(nil) + end + end + + context 'when one cell anchored images in cell' do + let(:book_with_one_cell_anchored_images) do + Creek::Book.new('spec/fixtures/sample-with-one-cell-anchored-images.xlsx') + end + let(:sheet_with_one_cell_anchored_images) do + Creek::Sheet.new(book_with_one_cell_anchored_images, 'Sheet 1', 1, '', '', '1', sheetfile) + end + let(:rows) { sheet_with_one_cell_anchored_images.with_images.rows.map { |r| r } } + + it 'returns image for anchored cell' do + expect(load_cell(rows, 'A2').size).to eq(1) + end + + it 'returns nil for non-anchored cell' do + expect(load_cell(rows, 'A4')).to eq(nil) + end + end + + context 'with excel without images' do + 
let(:book_no_images) { Creek::Book.new('spec/fixtures/sample.xlsx') } + let(:sheet_no_images) { Creek::Sheet.new(book_no_images, 'Sheet 1', 1, '', '', '1', sheetfile) } + + it 'does not break on with_images' do + rows = sheet_no_images.with_images.rows.map { |r| r } + expect(load_cell(rows, 'A10')).to eq(0.15) + end + end + + context 'when nodes are namespaced' do + let(:namespaced_book) { Creek::Book.new('spec/fixtures/sample_namespaced.xlsx') } + let(:namespaced_sheet) { Creek::Sheet.new(namespaced_book, 'Sheet 1', 1, '', '', '1', sheetfile) } + + it 'parses rows correctly' do + rows = namespaced_sheet.rows.map { |r| r } + expect(load_cell(rows, 'A10')).to eq(0.15) + end + end + end + + describe '#images_at' do + it 'returns images for merged cell' do + image = sheet_with_images.with_images.images_at('A5')[0] + expect(image.class).to eq(Pathname) + end + + it 'returns images for empty row' do + image = sheet_with_images.with_images.images_at('A8')[0] + expect(image.class).to eq(Pathname) + end + + it 'returns nil for empty cell' do + image = sheet_with_images.with_images.images_at('B3') + expect(image).to eq(nil) + end + + it 'returns nil for empty cell without preloading images' do + image = sheet_with_images.images_at('B3') + expect(image).to eq(nil) + end + end + + describe '#simple_rows' do + let(:book_with_headers) { Creek::Book.new('spec/fixtures/sample-with-headers.xlsx') } + let(:sheet) { Creek::Sheet.new(book_with_headers, 'Sheet 1', 1, '', '', '1', sheetfile) } + + subject { sheet.simple_rows.to_a[1] } + + it 'returns values by letters' do + expect(subject['A']).to eq 'value1' + expect(subject['B']).to eq 'value2' + end + + context 'when enable with_headers property' do + before { sheet.with_headers = true } + + it 'returns values by headers name' do + expect(subject['HeaderA']).to eq 'value1' + expect(subject['HeaderB']).to eq 'value2' + expect(subject['HeaderC']).to eq 'value3' + end + + it 'returns headers correctly when called multiple times' do + 
row = sheet.simple_rows.to_a[1] + expect(row['HeaderA']).to eq 'value1' + expect(row['HeaderB']).to eq 'value2' + expect(row['HeaderC']).to eq 'value3' + + row = sheet.simple_rows.to_a[1] + expect(row['HeaderA']).to eq 'value1' + expect(row['HeaderB']).to eq 'value2' + expect(row['HeaderC']).to eq 'value3' + end + end + + context 'when nodes are namespaced' do + let(:namespaced_book) { Creek::Book.new('spec/fixtures/sample-with-headers_namespaced.xlsx') } + let(:sheet) { Creek::Sheet.new(namespaced_book, 'Sheet 1', 1, '', '', '1', sheetfile) } + + it 'returns values by letters' do + expect(subject['A']).to eq 'value1' + expect(subject['B']).to eq 'value2' + end + + context 'when enable with_headers property' do + before { sheet.with_headers = true } + + it 'returns values by headers name' do + expect(subject['HeaderA']).to eq 'value1' + expect(subject['HeaderB']).to eq 'value2' + expect(subject['HeaderC']).to eq 'value3' + end + + it 'returns headers correctly when called multiple times' do + row = sheet.simple_rows.to_a[1] + expect(row['HeaderA']).to eq 'value1' + expect(row['HeaderB']).to eq 'value2' + expect(row['HeaderC']).to eq 'value3' + + row = sheet.simple_rows.to_a[1] + expect(row['HeaderA']).to eq 'value1' + expect(row['HeaderB']).to eq 'value2' + expect(row['HeaderC']).to eq 'value3' + end + end + end + end +end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 13f918c..ed04310 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,3 +1,7 @@ require 'creek' require 'pry' +require 'time' +RSpec.configure do |config| + config.filter_run_excluding remote: true +end diff --git a/spec/styles/converter_spec.rb b/spec/styles/converter_spec.rb index f5362ba..ea44e8e 100644 --- a/spec/styles/converter_spec.rb +++ b/spec/styles/converter_spec.rb @@ -1,15 +1,20 @@ require './spec/spec_helper' describe Creek::Styles::Converter do - describe :call do def convert(value, type, style) Creek::Styles::Converter.call(value, type, style) end + describe 
:date do + it 'works' do + expect(convert('41275', 'n', :date)).to eq(Date.new(2013, 0o1, 0o1)) + end + end + describe :date_time do - it "works" do - convert('41275', 'n', :date_time).should == Date.new(2013,01,01) + it 'works' do + expect(convert('41275', 'n', :date_time)).to eq(Time.new(2013, 0o1, 0o1)) end end end diff --git a/spec/styles/style_types_spec.rb b/spec/styles/style_types_spec.rb index 133c42d..c737908 100644 --- a/spec/styles/style_types_spec.rb +++ b/spec/styles/style_types_spec.rb @@ -1,15 +1,15 @@ require './spec/spec_helper' describe Creek::Styles::StyleTypes do - describe :call do - it "return array of styletypes with mapping to ruby types" do + it 'return array of styletypes with mapping to ruby types' do xml_file = File.open('spec/fixtures/styles/first.xml') doc = Nokogiri::XML(xml_file) res = Creek::Styles::StyleTypes.new(doc).call - res.size.should == 8 - res[3].should == :date_time - res.should == [:unsupported, :unsupported, :unsupported, :date_time, :unsupported, :unsupported, :unsupported, :unsupported] + expect(res.size).to eq(8) + expect(res[3]).to eq(:date_time) + expect(res).to eq(%i[unsupported unsupported unsupported date_time unsupported unsupported + unsupported unsupported]) end end end diff --git a/spec/test_spec.rb b/spec/test_spec.rb index 5baea2f..6ff1fff 100644 --- a/spec/test_spec.rb +++ b/spec/test_spec.rb @@ -2,37 +2,105 @@ describe 'Creek trying to parsing an invalid file.' do it 'Fail to open a legacy xls file.' do - lambda { Creek::Book.new 'spec/fixtures/invalid.xls' }.should raise_error 'Not a valid file format.' + expect { Creek::Book.new 'spec/fixtures/invalid.xls' } + .to raise_error 'Not a valid file format.' end it 'Ignore file extensions on request.' 
do path = 'spec/fixtures/sample-as-zip.zip' - lambda { Creek::Book.new path, :check_file_extension => false }.should_not raise_error + expect { Creek::Book.new path, check_file_extension: false } + .not_to raise_error end it 'Check file extension when requested.' do - open_book = lambda { Creek::Book.new 'spec/fixtures/invalid.xls', :check_file_extension => true } - open_book.should raise_error 'Not a valid file format.' + expect { Creek::Book.new 'spec/fixtures/invalid.xls', check_file_extension: true } + .to raise_error 'Not a valid file format.' end it 'Check file extension of original_filename if passed.' do path = 'spec/fixtures/temp_string_io_file_path_with_no_extension' - lambda { Creek::Book.new path, :original_filename => 'invalid.xls' }.should raise_error 'Not a valid file format.' - lambda { Creek::Book.new path, :original_filename => 'valid.xlsx' }.should_not raise_error + expect { Creek::Book.new path, original_filename: 'invalid.xls' } + .to raise_error 'Not a valid file format.' + expect { Creek::Book.new path, original_filename: 'valid.xlsx' } + .not_to raise_error + end +end + +describe 'Creek parsing dates on a sample XLSX file' do + before(:all) do + @creek = Creek::Book.new 'spec/fixtures/sample_dates.xlsx' + + @expected_datetime_rows = [ + { 'A3' => 'Date', 'B3' => Date.parse('2018-01-01') }, + { 'A4' => 'Datetime 00:00:00', 'B4' => Time.parse('2018-01-01 00:00:00') }, + { 'A5' => 'Datetime', 'B5' => Time.parse('2018-01-01 23:59:59') } + ] + end + + after(:all) do + @creek.close + end + + it 'parses dates successfully' do + rows = [] + row_count = 0 + @creek.sheets[0].rows.each do |row| + rows << row + row_count += 1 + end + + (2..5).each do |number| + expect(rows[number]).to eq(@expected_datetime_rows[number - 2]) + end + end +end + +describe 'Creek parsing a file with large numbrts.' 
do + before(:all) do + @creek = Creek::Book.new 'spec/fixtures/large_numbers.xlsx' + @expected_simple_rows = [{ 'A' => '7.83294732E8', 'B' => '783294732', 'C' => 783_294_732.0 }] + end + + after(:all) do + @creek.close end + it 'Parse simple rows successfully.' do + rows = [] + row_count = 0 + @creek.sheets[0].simple_rows.each do |row| + rows << row + row_count += 1 + end + expect(rows[0]).to eq(@expected_simple_rows[0]) + end end describe 'Creek parsing a sample XLSX file' do before(:all) do @creek = Creek::Book.new 'spec/fixtures/sample.xlsx' - @expected_rows = [{'A1'=>'Content 1', 'B1'=>nil, 'C1'=>'Content 2', 'D1'=>nil, 'E1'=>'Content 3'}, - {'A2'=>nil, 'B2'=>'Content 4', 'C2'=>nil, 'D2'=>'Content 5', 'E2'=>nil, 'F2'=>'Content 6'}, - {}, - {'A4'=>'Content 7', 'B4'=>'Content 8', 'C4'=>'Content 9', 'D4'=>'Content 10', 'E4'=>'Content 11', 'F4'=>'Content 12'}, - {'A5'=>nil, 'B5'=>nil, 'C5'=>nil, 'D5'=>nil, 'E5'=>nil, 'F5'=>nil, 'G5'=>nil, 'H5'=>nil, 'I5'=>nil, 'J5'=>nil, 'K5'=>nil, 'L5'=>nil, 'M5'=>nil, 'N5'=>nil, 'O5'=>nil, 'P5'=>nil, 'Q5'=>nil, 'R5'=>nil, 'S5'=>nil, 'T5'=>nil, 'U5'=>nil, 'V5'=>nil, 'W5'=>nil, 'X5'=>nil, 'Y5'=>nil, 'Z5'=>'Z Content', 'AA5'=>nil, 'AB5'=>nil, 'AC5'=>nil, 'AD5'=>nil, 'AE5'=>nil, 'AF5'=>nil, 'AG5'=>nil, 'AH5'=>nil, 'AI5'=>nil, 'AJ5'=>nil, 'AK5'=>nil, 'AL5'=>nil, 'AM5'=>nil, 'AN5'=>nil, 'AO5'=>nil, 'AP5'=>nil, 'AQ5'=>nil, 'AR5'=>nil, 'AS5'=>nil, 'AT5'=>nil, 'AU5'=>nil, 'AV5'=>nil, 'AW5'=>nil, 'AX5'=>nil, 'AY5'=>nil, 'AZ5'=>'Content 13'}, - {'A6'=>'1', 'B6'=>'2', 'C6'=>'3'}, {'A7'=>'Content 15', 'B7'=>'Content 16', 'C7'=>'Content 18', 'D7'=>'Content 19'}, - {'A8'=>nil, 'B8'=>'Content 20', 'C8'=>nil, 'D8'=>nil, 'E8'=>nil, 'F8'=>'Content 21'}] + @expected_rows = [{ 'A1' => 'Content 1', 'B1' => nil, 'C1' => 'Content 2', 'D1' => nil, 'E1' => 'Content 3' }, + { 'A2' => nil, 'B2' => 'Content 4', 'C2' => nil, 'D2' => 'Content 5', 'E2' => nil, 'F2' => 'Content 6' }, + {}, + { 'A4' => 'Content 7', 'B4' => 'Content 8', 'C4' => 'Content 9', 'D4' 
=> 'Content 10', 'E4' => 'Content 11', 'F4' => 'Content 12' }, + { 'A5' => nil, 'B5' => nil, 'C5' => nil, 'D5' => nil, 'E5' => nil, 'F5' => nil, 'G5' => nil, 'H5' => nil, 'I5' => nil, 'J5' => nil, 'K5' => nil, 'L5' => nil, 'M5' => nil, 'N5' => nil, 'O5' => nil, 'P5' => nil, 'Q5' => nil, 'R5' => nil, 'S5' => nil, 'T5' => nil, 'U5' => nil, 'V5' => nil, 'W5' => nil, 'X5' => nil, 'Y5' => nil, 'Z5' => 'Z Content', 'AA5' => nil, 'AB5' => nil, 'AC5' => nil, 'AD5' => nil, 'AE5' => nil, 'AF5' => nil, 'AG5' => nil, 'AH5' => nil, 'AI5' => nil, 'AJ5' => nil, 'AK5' => nil, 'AL5' => nil, 'AM5' => nil, 'AN5' => nil, 'AO5' => nil, 'AP5' => nil, 'AQ5' => nil, 'AR5' => nil, 'AS5' => nil, 'AT5' => nil, 'AU5' => nil, 'AV5' => nil, 'AW5' => nil, 'AX5' => nil, 'AY5' => nil, 'AZ5' => 'Content 13' }, + { 'A6' => '1', 'B6' => '2', 'C6' => '3' }, { 'A7' => 'Content 15', 'B7' => 'Content 16', 'C7' => 'Content 18', 'D7' => 'Content 19' }, + { 'A8' => nil, 'B8' => 'Content 20', 'C8' => nil, 'D8' => nil, 'E8' => nil, 'F8' => 'Content 21' }, + { 'A10' => 0.15, 'B10' => 0.15 }] + + @expected_simple_rows = [{ 'A' => 'Content 1', 'B' => nil, 'C' => 'Content 2', 'D' => nil, 'E' => 'Content 3' }, + { 'A' => nil, 'B' => 'Content 4', 'C' => nil, 'D' => 'Content 5', 'E' => nil, + 'F' => 'Content 6' }, + {}, + { 'A' => 'Content 7', 'B' => 'Content 8', 'C' => 'Content 9', 'D' => 'Content 10', 'E' => 'Content 11', + 'F' => 'Content 12' }, + { 'A' => nil, 'B' => nil, 'C' => nil, 'D' => nil, 'E' => nil, 'F' => nil, 'G' => nil, 'H' => nil, 'I' => nil, + 'J' => nil, 'K' => nil, 'L' => nil, 'M' => nil, 'N' => nil, 'O' => nil, 'P' => nil, 'Q' => nil, 'R' => nil, 'S' => nil, 'T' => nil, 'U' => nil, 'V' => nil, 'W' => nil, 'X' => nil, 'Y' => nil, 'Z' => 'Z Content', 'AA' => nil, 'AB' => nil, 'AC' => nil, 'AD' => nil, 'AE' => nil, 'AF' => nil, 'AG' => nil, 'AH' => nil, 'AI' => nil, 'AJ' => nil, 'AK' => nil, 'AL' => nil, 'AM' => nil, 'AN' => nil, 'AO' => nil, 'AP' => nil, 'AQ' => nil, 'AR' => nil, 'AS' => nil, 'AT' 
=> nil, 'AU' => nil, 'AV' => nil, 'AW' => nil, 'AX' => nil, 'AY' => nil, 'AZ' => 'Content 13' }, + { 'A' => '1', 'B' => '2', 'C' => '3' }, + { 'A' => 'Content 15', 'B' => 'Content 16', 'C' => 'Content 18', 'D' => 'Content 19' }, + { 'A' => nil, 'B' => 'Content 20', 'C' => nil, 'D' => nil, 'E' => nil, + 'F' => 'Content 21' }, + { 'A' => 0.15, 'B' => 0.15 }] end after(:all) do @@ -40,43 +108,65 @@ end it 'open an XLSX file successfully.' do - @creek.should_not be_nil + expect(@creek).not_to be_nil + end + + it 'opens small remote files successfully', remote: true do + url = 'https://file-examples.com/wp-content/uploads/2017/02/file_example_XLSX_10.xlsx' + @creek = Creek::Book.new(url, remote: true) + + expect(@creek.sheets[0]).to be_a Creek::Sheet + end + + it 'opens large remote files successfully', remote: true do + url = 'http://www.house.leg.state.mn.us/comm/docs/BanaianZooExample.xlsx' + @creek = Creek::Book.new(url, remote: true) + + expect(@creek.sheets[0]).to be_a Creek::Sheet end it 'find sheets successfully.' do - @creek.sheets.count.should == 1 + expect(@creek.sheets.count).to eq(1) sheet = @creek.sheets.first - sheet.state.should eql nil - sheet.name.should eql 'Sheet1' - sheet.rid.should eql 'rId1' + expect(sheet.state).to eql nil + expect(sheet.name).to eql 'Sheet1' + expect(sheet.rid).to eql 'rId1' + end + + it 'Parse simple rows successfully.' do + rows = [] + row_count = 0 + @creek.sheets[0].simple_rows.each do |row| + rows << row + row_count += 1 + end + 9.times do |number| + expect(rows[number]).to eq(@expected_simple_rows[number]) + end + expect(row_count).to eq(9) end it 'Parse rows with empty cells successfully.' 
do - rows = Array.new + rows = [] row_count = 0 @creek.sheets[0].rows.each do |row| rows << row row_count += 1 end - rows[0].should == @expected_rows[0] - rows[1].should == @expected_rows[1] - rows[2].should == @expected_rows[2] - rows[3].should == @expected_rows[3] - rows[4].should == @expected_rows[4] - rows[5].should == @expected_rows[5] - rows[6].should == @expected_rows[6] - rows[7].should == @expected_rows[7] - row_count.should == 8 + 9.times do |number| + expect(rows[number]).to eq(@expected_rows[number]) + end + expect(row_count).to eq(9) end it 'Parse rows with empty cells and meta data successfully.' do - rows = Array.new + rows = [] row_count = 0 @creek.sheets[0].rows_with_meta_data.each do |row| rows << row row_count += 1 end - rows.map{|r| r['cells']}.should == @expected_rows + expect(rows.map { |r| r['cells'] }).to eq(@expected_rows) end end