1 /**
  2  * Copyright (C) 2014 KO GmbH <copyright@kogmbh.com>
  3  *
  4  * @licstart
  5  * This file is part of WebODF.
  6  *
  7  * WebODF is free software: you can redistribute it and/or modify it
  8  * under the terms of the GNU Affero General Public License (GNU AGPL)
  9  * as published by the Free Software Foundation, either version 3 of
 10  * the License, or (at your option) any later version.
 11  *
 12  * WebODF is distributed in the hope that it will be useful, but
 13  * WITHOUT ANY WARRANTY; without even the implied warranty of
 14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15  * GNU Affero General Public License for more details.
 16  *
 17  * You should have received a copy of the GNU Affero General Public License
 18  * along with WebODF.  If not, see <http://www.gnu.org/licenses/>.
 19  * @licend
 20  *
 21  * @source: http://www.webodf.org/
 22  * @source: https://github.com/kogmbh/WebODF/
 23  */
 24 
 25 /*global Node, NodeFilter, core, runtime, odf*/
 26 
 27 /**
 28  * A filter that allows a position if it is in front of a word, picture etc.
 29  *
 30  * Word boundaries are detected by the presence of punctuation as defined in the unicode standard.
 31  * The included UTF categories are:
 32  * - Pc (connector punctuation)
 33  * - Pd (dash punctuation)
 34  * - Pe (close punctuation)
 35  * - Pf (final punctuation)
 36  * - Pi (initial punctuation)
 37  * - Po (other punctuation)
 38  * - Ps (Open punctuation)
 39  *
 40  * In addition, the following ranges are also included as boundaries:
 41  * - 2000-206F (general punctuation)
 42  * - 2E00-2E7F (supplemental punctuation)
 43  * - 3000-303F (CJK symbols and punctuation)
 44  * - 12400-1247F (cuneiform numbers and punctuation)
 45  *
 46  * Some libraries and sites exist for assisting in creation of the regex. The one
 47  * used for this particular expression was http://apps.timwhitlock.info/js/regex
 48  * which is based on the cset javascript library
 49  * (http://inimino.org/~inimino/blog/javascript_cset).
 50  *
 51  *
 52  * @constructor
 53  * @implements {core.PositionFilter}
 54  * @param {!ops.OdtDocument} odtDocument
 55  * @param {!odf.WordBoundaryFilter.IncludeWhitespace} includeWhitespace Specify the type of whitespace to include within
 56  *  the word boundary. TRAILING causes the accepted position to be after the whitespace trailing a word, while LEADING
 57  *  causes the accepted position to be just after the word boundary (but before the trailing whitespace).
 58  */
 59 odf.WordBoundaryFilter = function WordBoundaryFilter(odtDocument, includeWhitespace) {
 60     "use strict";
 61     var TEXT_NODE = Node.TEXT_NODE,
 62         ELEMENT_NODE = Node.ELEMENT_NODE,
 63         odfUtils = odf.OdfUtils,
 64         // Sourced from http://apps.timwhitlock.info/js/regex, including all punctuation components
 65         punctuation = /[!-#%-*,-\/:-;?-@\[-\]_{}¡«·»¿;·՚-՟։-֊־׀׃׆׳-״؉-؊،-؍؛؞-؟٪-٭۔܀-܍߷-߹।-॥॰෴๏๚-๛༄-༒༺-༽྅࿐-࿔၊-၏჻፡-፨᙭-᙮᚛-᚜᛫-᛭᜵-᜶។-៖៘-៚᠀-᠊᥄-᥅᧞-᧟᨞-᨟᭚-᭠᰻-᰿᱾-᱿\u2000-\u206e⁽-⁾₍-₎〈-〉❨-❵⟅-⟆⟦-⟯⦃-⦘⧘-⧛⧼-⧽⳹-⳼⳾-⳿⸀-\u2e7e\u3000-\u303f゠・꘍-꘏꙳꙾꡴-꡷꣎-꣏꤮-꤯꥟꩜-꩟﴾-﴿︐-︙︰-﹒﹔-﹡﹣﹨﹪-﹫！-＃％-＊，-／：-；？-＠［-］＿｛｝｟-･]|\ud800[\udd00-\udd01\udf9f\udfd0]|\ud802[\udd1f\udd3f\ude50-\ude58]|\ud809[\udc00-\udc7e]/,
 66         spacing = /\s/,
 67         /**@const*/
 68         FILTER_ACCEPT = core.PositionFilter.FilterResult.FILTER_ACCEPT,
 69         /**@const*/
 70         FILTER_REJECT = core.PositionFilter.FilterResult.FILTER_REJECT,
 71         /**@const*/
 72         TRAILING = odf.WordBoundaryFilter.IncludeWhitespace.TRAILING,
 73         /**@const*/
 74         LEADING = odf.WordBoundaryFilter.IncludeWhitespace.LEADING,
 75         /**
 76          * @enum {number}
 77          */
 78         NeighborType = {
 79             NO_NEIGHBOUR:       0,
 80             SPACE_CHAR:         1,
 81             PUNCTUATION_CHAR:   2,
 82             WORD_CHAR:          3,
 83             OTHER:              4
 84         };
 85 
 86     /**
 87      * Returns the first filtered sibling ecountered while travelling up the dom from node until
 88      * before the documentRoot - or null if none is found.
 89      * @param {?Node} node
 90      * @param {!number} direction look for a left sibling when negative - for a right sibling otherwise
 91      * @param {!function(?Node):!number} nodeFilter
 92      * @return {?Node}
 93      */
 94     function findHigherNeighborNode(node, direction, nodeFilter) {
 95         var neighboringNode = null,
 96             rootNode = odtDocument.getRootNode(),
 97             unfilteredCandidate;
 98 
 99         while (node !== rootNode && node !== null && neighboringNode === null) {
100             unfilteredCandidate = (direction < 0) ? node.previousSibling : node.nextSibling;
101             if (nodeFilter(unfilteredCandidate) === NodeFilter.FILTER_ACCEPT) {
102                 neighboringNode = unfilteredCandidate;
103             }
104             node = node.parentNode;
105         }
106 
107         return neighboringNode;
108     }
109 
110     /**
111      * @param {?Node} node
112      * @param {!function():!number} getOffset returns the offset inside the node
113      * @return {!NeighborType}
114      */
115     function typeOfNeighbor(node, getOffset) {
116         var neighboringChar;
117 
118         if (node === null) {
119             return NeighborType.NO_NEIGHBOUR;
120         }
121         if (odfUtils.isCharacterElement(node)) {
122             return NeighborType.SPACE_CHAR;
123         }
124         if (node.nodeType === TEXT_NODE || odfUtils.isTextSpan(node) || odfUtils.isHyperlink(node)) {
125             neighboringChar = node.textContent.charAt(getOffset());
126 
127             if (spacing.test(neighboringChar)) {
128                 return NeighborType.SPACE_CHAR;
129             }
130             if (punctuation.test(neighboringChar)) {
131                 return NeighborType.PUNCTUATION_CHAR;
132             }
133             return NeighborType.WORD_CHAR;
134         }
135         return NeighborType.OTHER;
136     }
137 
138     /**
139      * @param {!core.PositionIterator} iterator
140      * @return {!core.PositionFilter.FilterResult}
141      */
142     this.acceptPosition = function (iterator) {
143         var container = iterator.container(),
144             /**@type{Node}*/
145             leftNode = iterator.leftNode(),
146             rightNode = iterator.rightNode(),
147             // For performance reasons, do not calculate the offset inside the dom until it is necessary
148             getRightCharOffset = iterator.unfilteredDomOffset,
149             getLeftCharOffset = function() {return iterator.unfilteredDomOffset() - 1;},
150             leftNeighborType,
151             rightNeighborType;
152 
153         // If this could be the end of an element node, look for the neighboring node higher in the dom
154         if (container.nodeType === ELEMENT_NODE) {
155             if (rightNode === null) {
156                 rightNode = findHigherNeighborNode(container, 1, iterator.getNodeFilter());
157             }
158             if (leftNode === null) {
159                 leftNode = findHigherNeighborNode(container, -1, iterator.getNodeFilter());
160             }
161         }
162 
163         // If we dont stay inside the container node, the getOffset function needs to be modified so as to
164         // return the offset of the characters just at the beginning/end of the respective neighboring node.
165         if (container !== rightNode) {
166             getRightCharOffset = function() {return 0;};
167         }
168         if (container !== leftNode && leftNode !== null) {
169             getLeftCharOffset = function() {return leftNode.textContent.length - 1;};
170         }
171 
172         leftNeighborType = typeOfNeighbor(leftNode, getLeftCharOffset);
173         rightNeighborType = typeOfNeighbor(rightNode, getRightCharOffset);
174 
175         // Reject if: is between two usual characters (inside word) OR
176         //            is between two punctuation marks OR
177         //            (if including trailing space) is before a spacing and not behind the edge (word ending)
178         //            (if excluding trailing space) is before an edge (word start) and not behind the spacing
179         if ((leftNeighborType === NeighborType.WORD_CHAR    && rightNeighborType === NeighborType.WORD_CHAR) ||
180             (leftNeighborType === NeighborType.PUNCTUATION_CHAR && rightNeighborType === NeighborType.PUNCTUATION_CHAR) ||
181             (includeWhitespace === TRAILING &&
182                 leftNeighborType !== NeighborType.NO_NEIGHBOUR && rightNeighborType === NeighborType.SPACE_CHAR) ||
183             (includeWhitespace === LEADING &&
184                 leftNeighborType === NeighborType.SPACE_CHAR && rightNeighborType !== NeighborType.NO_NEIGHBOUR)) {
185             return FILTER_REJECT;
186         }
187         return FILTER_ACCEPT;
188     };
189 };
190 
/**
 * Type of whitespace to include within the word boundary.
 * The value is passed as the includeWhitespace argument of odf.WordBoundaryFilter, whose
 * acceptPosition compares it against TRAILING and LEADING; any other value (such as None)
 * enables neither of the two whitespace-specific rejection rules.
 * @enum {!number}
 */
odf.WordBoundaryFilter.IncludeWhitespace = {
    /**@const*/None: 0,
    /**@const*/TRAILING: 1,
    /**@const*/LEADING: 2
};
200