/**
 * Copyright (C) 2014 KO GmbH <copyright@kogmbh.com>
 *
 * @licstart
 * This file is part of WebODF.
 *
 * WebODF is free software: you can redistribute it and/or modify it
 * under the terms of the GNU Affero General Public License (GNU AGPL)
 * as published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * WebODF is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with WebODF.  If not, see <http://www.gnu.org/licenses/>.
 * @licend
 *
 * @source: http://www.webodf.org/
 * @source: https://github.com/kogmbh/WebODF/
 */

/*global Node, NodeFilter, core, runtime, odf*/

/**
 * A filter that allows a position if it is in front of a word, picture etc.
 *
 * Word boundaries are detected by the presence of punctuation as defined in the unicode standard.
 * The included UTF categories are:
 * - Pc (connector punctuation)
 * - Pd (dash punctuation)
 * - Pe (close punctuation)
 * - Pf (final punctuation)
 * - Pi (initial punctuation)
 * - Po (other punctuation)
 * - Ps (Open punctuation)
 *
 * In addition, the following ranges are also included as boundaries:
 * - 2000-206F (general punctuation)
 * - 2E00-2E7F (supplemental punctuation)
 * - 3000-303F (CJK symbols and punctuation)
 * - 12400-1247F (cuneiform numbers and punctuation)
 *
 * Some libraries and sites exist for assisting in creation of the regex. The one
 * used for this particular expression was http://apps.timwhitlock.info/js/regex
 * which is based on the cset javascript library
 * (http://inimino.org/~inimino/blog/javascript_cset).
 *
 * The expression is written with \uXXXX escapes only, so that the source file stays
 * pure ASCII and the character class cannot be damaged by re-encoding or unicode
 * normalization of the file.
 *
 * @constructor
 * @implements {core.PositionFilter}
 * @param {!ops.OdtDocument} odtDocument
 * @param {!odf.WordBoundaryFilter.IncludeWhitespace} includeWhitespace Specify the type of whitespace to include within
 *  the word boundary. TRAILING causes the accepted position to be after the whitespace trailing a word, while LEADING
 *  causes the accepted position to be just after the word boundary (but before the trailing whitespace).
 */
odf.WordBoundaryFilter = function WordBoundaryFilter(odtDocument, includeWhitespace) {
    "use strict";
    var TEXT_NODE = Node.TEXT_NODE,
        ELEMENT_NODE = Node.ELEMENT_NODE,
        odfUtils = odf.OdfUtils,
        // Sourced from http://apps.timwhitlock.info/js/regex, including all punctuation components.
        // The first alternative covers the BMP, the remaining ones match astral characters as surrogate pairs.
        punctuation = /[!-#%-*,-\/:-;?-@\[-\]_{}\u00a1\u00ab\u00b7\u00bb\u00bf\u037e\u0387\u055a-\u055f\u0589-\u058a\u05be\u05c0\u05c3\u05c6\u05f3-\u05f4\u0609-\u060a\u060c-\u060d\u061b\u061e-\u061f\u066a-\u066d\u06d4\u0700-\u070d\u07f7-\u07f9\u0964-\u0965\u0970\u0df4\u0e4f\u0e5a-\u0e5b\u0f04-\u0f12\u0f3a-\u0f3d\u0f85\u0fd0-\u0fd4\u104a-\u104f\u10fb\u1361-\u1368\u166d-\u166e\u169b-\u169c\u16eb-\u16ed\u1735-\u1736\u17d4-\u17d6\u17d8-\u17da\u1800-\u180a\u1944-\u1945\u19de-\u19df\u1a1e-\u1a1f\u1b5a-\u1b60\u1c3b-\u1c3f\u1c7e-\u1c7f\u2000-\u206e\u207d-\u207e\u208d-\u208e\u2329-\u232a\u2768-\u2775\u27c5-\u27c6\u27e6-\u27ef\u2983-\u2998\u29d8-\u29db\u29fc-\u29fd\u2cf9-\u2cfc\u2cfe-\u2cff\u2e00-\u2e7e\u3000-\u303f\u30a0\u30fb\ua60d-\ua60f\ua673\ua67e\ua874-\ua877\ua8ce-\ua8cf\ua92e-\ua92f\ua95f\uaa5c-\uaa5f\ufd3e-\ufd3f\ufe10-\ufe19\ufe30-\ufe52\ufe54-\ufe61\ufe63\ufe68\ufe6a-\ufe6b\uff01-\uff03\uff05-\uff0a\uff0c-\uff0f\uff1a-\uff1b\uff1f-\uff20\uff3b-\uff3d\uff3f\uff5b\uff5d\uff5f-\uff65]|\ud800[\udd00-\udd01\udf9f\udfd0]|\ud802[\udd1f\udd3f\ude50-\ude58]|\ud809[\udc00-\udc7e]/,
        spacing = /\s/,
        /**@const*/
        FILTER_ACCEPT = core.PositionFilter.FilterResult.FILTER_ACCEPT,
        /**@const*/
        FILTER_REJECT = core.PositionFilter.FilterResult.FILTER_REJECT,
        /**@const*/
        TRAILING = odf.WordBoundaryFilter.IncludeWhitespace.TRAILING,
        /**@const*/
        LEADING = odf.WordBoundaryFilter.IncludeWhitespace.LEADING,
        /**
         * Classification of whatever sits directly next to the tested position.
         * @enum {number}
         */
        NeighborType = {
            NO_NEIGHBOUR:     0,
            SPACE_CHAR:       1,
            PUNCTUATION_CHAR: 2,
            WORD_CHAR:        3,
            OTHER:            4
        };

    /**
     * Returns the first filtered sibling encountered while travelling up the dom from node until
     * before the documentRoot - or null if none is found.
     * @param {?Node} node
     * @param {!number} direction look for a left sibling when negative - for a right sibling otherwise
     * @param {!function(?Node):!number} nodeFilter
     * @return {?Node}
     */
    function findHigherNeighborNode(node, direction, nodeFilter) {
        var neighboringNode = null,
            rootNode = odtDocument.getRootNode(),
            unfilteredCandidate;

        while (node !== rootNode && node !== null && neighboringNode === null) {
            unfilteredCandidate = (direction < 0) ? node.previousSibling : node.nextSibling;
            // A missing sibling can never become the neighbor, so the filter is not bothered with null.
            if (unfilteredCandidate !== null
                    && nodeFilter(unfilteredCandidate) === NodeFilter.FILTER_ACCEPT) {
                neighboringNode = unfilteredCandidate;
            }
            node = node.parentNode;
        }

        return neighboringNode;
    }

    /**
     * Returns the character found at the given offset. If the UTF-16 code unit at that offset is one half
     * of a surrogate pair, the complete pair is returned, as the astral alternatives of the punctuation
     * expression can only match a whole pair.
     * Returns the empty string if the offset is outside of the text.
     * @param {!string} text
     * @param {!number} offset
     * @return {!string}
     */
    function characterAt(text, offset) {
        var character = text.charAt(offset),
            // NaN for the empty string, which fails both range checks below
            codeUnit = character.charCodeAt(0),
            otherHalf;

        if (codeUnit >= 0xD800 && codeUnit <= 0xDBFF) {
            // high surrogate: complete with the low surrogate following it
            otherHalf = text.charCodeAt(offset + 1);
            if (otherHalf >= 0xDC00 && otherHalf <= 0xDFFF) {
                character += text.charAt(offset + 1);
            }
        } else if (codeUnit >= 0xDC00 && codeUnit <= 0xDFFF) {
            // low surrogate: complete with the high surrogate in front of it
            otherHalf = text.charCodeAt(offset - 1);
            if (otherHalf >= 0xD800 && otherHalf <= 0xDBFF) {
                character = text.charAt(offset - 1) + character;
            }
        }
        return character;
    }

    /**
     * Classifies the given neighbor of the tested position.
     * @param {?Node} node the neighboring node, or null if there is none
     * @param {!function():!number} getOffset returns the offset inside the node
     * @return {!NeighborType}
     */
    function typeOfNeighbor(node, getOffset) {
        var neighboringChar;

        if (node === null) {
            return NeighborType.NO_NEIGHBOUR;
        }
        // Character elements are handled like whitespace
        if (odfUtils.isCharacterElement(node)) {
            return NeighborType.SPACE_CHAR;
        }
        if (node.nodeType === TEXT_NODE || odfUtils.isTextSpan(node) || odfUtils.isHyperlink(node)) {
            neighboringChar = characterAt(node.textContent, getOffset());

            if (spacing.test(neighboringChar)) {
                return NeighborType.SPACE_CHAR;
            }
            if (punctuation.test(neighboringChar)) {
                return NeighborType.PUNCTUATION_CHAR;
            }
            // Anything else, including "no character at that offset", counts as part of a word
            return NeighborType.WORD_CHAR;
        }
        return NeighborType.OTHER;
    }

    /**
     * Accepts the current position of the iterator if it is a word boundary.
     * @param {!core.PositionIterator} iterator
     * @return {!core.PositionFilter.FilterResult}
     */
    this.acceptPosition = function (iterator) {
        var container = iterator.container(),
            /**@type{Node}*/
            leftNode = iterator.leftNode(),
            rightNode = iterator.rightNode(),
            // For performance reasons, do not calculate the offset inside the dom until it is necessary.
            // The method is called on the iterator (instead of being stored detached from it) so that
            // it also works for iterator implementations which rely on "this".
            getRightCharOffset = function () { return iterator.unfilteredDomOffset(); },
            getLeftCharOffset = function () { return iterator.unfilteredDomOffset() - 1; },
            leftNeighborType,
            rightNeighborType;

        // If this could be the end of an element node, look for the neighboring node higher in the dom
        if (container.nodeType === ELEMENT_NODE) {
            if (rightNode === null) {
                rightNode = findHigherNeighborNode(container, 1, iterator.getNodeFilter());
            }
            if (leftNode === null) {
                leftNode = findHigherNeighborNode(container, -1, iterator.getNodeFilter());
            }
        }

        // If we dont stay inside the container node, the getOffset function needs to be modified so as to
        // return the offset of the characters just at the beginning/end of the respective neighboring node.
        if (container !== rightNode) {
            getRightCharOffset = function () { return 0; };
        }
        if (container !== leftNode && leftNode !== null) {
            getLeftCharOffset = function () { return leftNode.textContent.length - 1; };
        }

        leftNeighborType = typeOfNeighbor(leftNode, getLeftCharOffset);
        rightNeighborType = typeOfNeighbor(rightNode, getRightCharOffset);

        // Reject if: is between two usual characters (inside word) OR
        //            is between two punctuation marks OR
        //            (if including trailing space) is before a spacing and not behind the edge (word ending)
        //            (if excluding trailing space) is before an edge (word start) and not behind the spacing
        if ((leftNeighborType === NeighborType.WORD_CHAR && rightNeighborType === NeighborType.WORD_CHAR) ||
                (leftNeighborType === NeighborType.PUNCTUATION_CHAR && rightNeighborType === NeighborType.PUNCTUATION_CHAR) ||
                (includeWhitespace === TRAILING &&
                    leftNeighborType !== NeighborType.NO_NEIGHBOUR && rightNeighborType === NeighborType.SPACE_CHAR) ||
                (includeWhitespace === LEADING &&
                    leftNeighborType === NeighborType.SPACE_CHAR && rightNeighborType !== NeighborType.NO_NEIGHBOUR)) {
            return FILTER_REJECT;
        }
        return FILTER_ACCEPT;
    };
};

/**
 * Type of whitespace to include within the word boundary
 * @enum {!number}
 */
odf.WordBoundaryFilter.IncludeWhitespace = {
    /**@const*/None: 0,
    /**@const*/TRAILING: 1,
    /**@const*/LEADING: 2
};