001// Copyright 2004, 2005 The Apache Software Foundation
002//
003// Licensed under the Apache License, Version 2.0 (the "License");
004// you may not use this file except in compliance with the License.
005// You may obtain a copy of the License at
006//
007//     http://www.apache.org/licenses/LICENSE-2.0
008//
009// Unless required by applicable law or agreed to in writing, software
010// distributed under the License is distributed on an "AS IS" BASIS,
011// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
012// See the License for the specific language governing permissions and
013// limitations under the License.
014
015package org.apache.tapestry.util.text;
016
017/**
018 * An object that encodes a character according to rules of the HTML specification, 
019 * so that it will be properly parsed by a browser irrespectively of the character
020 * encoding used in the HTML output.
021 * 
022 * @author mb
023 * @since 4.0
024 */
025public class MarkupCharacterTranslator implements ICharacterTranslator
026{
027    private static final String SAFE_CHARACTERS =
028        "01234567890"
029            + "abcdefghijklmnopqrstuvwxyz"
030            + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
031            + "\t\n\r !#$%'()*+,-./:;=?@[\\]^_`{|}~";
032
033    private static final String[][] ENTITIES = {
034        { "\"", """ }, 
035                { "<", "&lt;" },
036                { ">", "&gt;" },
037                { "&", "&amp;" }
038    };
039    
040    private static final ICharacterMatcher SAFE_MATCHER = new AsciiCharacterMatcher(SAFE_CHARACTERS);
041    private static final ICharacterTranslator ENTITY_TRANSLATOR = new AsciiCharacterTranslator(ENTITIES);
042    
043    private boolean _encodeNonAscii;
044    private ICharacterMatcher _safeMatcher;
045    private ICharacterTranslator _entityTranslator;
046        
047    public MarkupCharacterTranslator()
048    {
049        this(true);
050    }
051    
052    public MarkupCharacterTranslator(boolean encodeNonAscii)
053    {
054        this(encodeNonAscii, SAFE_MATCHER, ENTITY_TRANSLATOR);
055    }
056    
057    public MarkupCharacterTranslator(boolean encodeNonAscii, ICharacterMatcher safeMatcher, ICharacterTranslator entityTranslator)
058    {
059        _encodeNonAscii = encodeNonAscii;
060        _safeMatcher = safeMatcher;
061        _entityTranslator = entityTranslator;
062    }
063
064    public MarkupCharacterTranslator(boolean encodeNonAscii, String safeCharacters, String[][] entities)
065    {
066        _encodeNonAscii = encodeNonAscii;
067        _safeMatcher = new AsciiCharacterMatcher(safeCharacters);
068        _entityTranslator = new AsciiCharacterTranslator(entities);
069    }
070    
071        /**
072         * @see org.apache.tapestry.util.text.IMarkupCharacterTranslator#translateAttribute(char)
073         */
074        public String translate(char ch) {
075                // IE and Firefox do not handle characters between 128 and 159 well, 
076                // so they have to be quoted as well 
077                if (ch >= 160 && !_encodeNonAscii) 
078                        return null;
079                
080                if (_safeMatcher.matches(ch))
081                        return null;
082
083                String entity = _entityTranslator.translate(ch);
084                if (entity != null)
085                        return entity;
086                
087                // needs to use a NumberFormat here to be fully compliant, 
088                // but this is accepted fine by the browsers
089                return "&#" + (int) ch + ";";
090        }
091}