Skip to content

Commit 30f24db

Browse files
author
yu
committed
Do not add a space character for CJK text.
1 parent 5a0105c commit 30f24db

File tree

3 files changed

+32
-3
lines changed

3 files changed

+32
-3
lines changed

qt/src/Utils.cc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,3 +221,19 @@ QString Utils::getSpellingLanguage(const QString& lang) {
221221
}
222222
return syslang;
223223
}
224+
225+
// Unicode blocks http://www.fileformat.info/info/unicode/block/index.htm
226+
bool Utils::spacedWord(const QString& text, bool prevWord) {
227+
short unicode = (prevWord ? text.back() : text.front()).unicode();
228+
// CJK Word
229+
std::vector<std::pair<int, int>> cjkWordRange{{0x2480, 0x303f}, {0x31c0, 0x9fff}
230+
, {0xf900, 0xfaff}, {0xfe30, 0xfe4f}, {0x20000, 0x2fa1f}};
231+
for(int i = 0; i < cjkWordRange.size(); i++) {
232+
if(unicode < cjkWordRange[i].first) {
233+
return true;
234+
} else if(unicode >= cjkWordRange[i].first && unicode <= cjkWordRange[i].second) {
235+
return false;
236+
}
237+
}
238+
return true;
239+
}

qt/src/Utils.hh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ QByteArray download(QUrl url, QString& messages, int timeout = 60000);
5353

5454
QString getSpellingLanguage(const QString& lang = QString());
5555

56+
// Inspects the boundary character of text (the last character if prevWord is
// true, the first one otherwise) and returns false if the implementation
// classifies it as CJK, true otherwise.
bool spacedWord(const QString& text, bool prevWord);
57+
5658
template<typename T>
5759
class AsyncQueue {
5860
public:

qt/src/hocr/HOCRPdfExporter.cc

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,8 @@ void HOCRPdfExporter::printChildren(PDFPainter& painter, const HOCRItem* item, c
685685
QString itemClass = item->itemClass();
686686
QRect itemRect = item->bbox();
687687
int childCount = item->children().size();
688+
bool prevSpacedWord, currentSpacedWord;
689+
prevSpacedWord = currentSpacedWord = false;
688690
if(itemClass == "ocr_par" && pdfSettings.uniformizeLineSpacing) {
689691
double yInc = double(itemRect.height()) / childCount;
690692
double y = itemRect.top() + yInc;
@@ -703,15 +705,24 @@ void HOCRPdfExporter::printChildren(PDFPainter& painter, const HOCRItem* item, c
703705
if(pdfSettings.fontSize == -1) {
704706
painter.setFontSize(wordItem->fontSize() * pdfSettings.detectedFontScaling);
705707
}
708+
709+
prevWordRight = wordRect.right();
710+
QString text = wordItem->text();
711+
currentSpacedWord = Utils::spacedWord(text, false);
706712
// If distance from previous word is large, keep the space
707713
if(wordRect.x() - prevWordRight > pdfSettings.preserveSpaceWidth * painter.getAverageCharWidth() / px2pu) {
708714
x = wordRect.x();
715+
} else {
716+
//need space
717+
if(currentSpacedWord && prevSpacedWord ) {
718+
x += painter.getTextWidth(" ") / px2pu;
719+
}
709720
}
710-
prevWordRight = wordRect.right();
711-
QString text = wordItem->text();
721+
712722
double wordBaseline = (x - itemRect.x()) * baseline.first + baseline.second;
713723
painter.drawText(x * px2pu, (y + wordBaseline) * px2pu, text);
714-
x += painter.getTextWidth(text + " ") / px2pu;
724+
x += painter.getTextWidth(text) / px2pu;
725+
prevSpacedWord = Utils::spacedWord(text, true);
715726
}
716727
}
717728
} else if(itemClass == "ocr_line" && !pdfSettings.uniformizeLineSpacing) {

0 commit comments

Comments
 (0)