Skip to content

Improve DOCX conversion to text #5

@daliboris

Description

@daliboris

Plays prepared in LibreOffice contain numeric values for @w:styleId:

<w:style w:type="paragraph" w:styleId="912" w:customStyle="1">
  <w:name w:val="DraCor standard" />
  <w:qFormat />
  <w:pPr>
   <w:pBdr />
   <w:spacing w:after="120" w:line="360" w:lineRule="auto" />
   <w:ind />
  </w:pPr>
  <w:rPr>
   <w:rFonts w:ascii="Times New Roman" w:hAnsi="Times New Roman" w:cs="Times New Roman" w:eastAsiaTheme="minorHAnsi" />
   <w:sz w:val="24" />
   <w:lang w:eastAsia="en-US" />
   <w14:ligatures w14:val="standardContextual" />
  </w:rPr>
 </w:style>

Plays prepared in MS Word contain textual values for @w:styleId:

<w:style w:type="paragraph" w:customStyle="1" w:styleId="DraCorstandard">
  <w:name w:val="DraCor standard" />
  <w:qFormat />
  <w:rsid w:val="00735D94" />
  <w:pPr>
   <w:suppressAutoHyphens w:val="0" />
   <w:spacing w:after="120" w:line="360" w:lineRule="auto" />
  </w:pPr>
  <w:rPr>
   <w:rFonts w:ascii="Times New Roman" w:hAnsi="Times New Roman" w:cs="Times New Roman" />
   <w:kern w:val="2" />
   <w:sz w:val="24" />
   <w:szCs w:val="20" />
   <w:lang w:val="la-Latn" />
   <w14:ligatures w14:val="standardContextual" />
  </w:rPr>
 </w:style>

The current conversion uses textual style IDs to remove redundant content, so the following templates do not match plays whose style IDs are numeric:

<xsl:template match="w:document/w:body//w:p[w:pPr/w:pStyle/@w:val='DraCoradditions']" />
<xsl:template match="w:document/w:body//w:r[w:rPr/w:rStyle/@w:val='DraCormarkversepart']" />

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions