/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.portals.applications.webcontent2.rewriter.htmlcleaner;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.portals.applications.webcontent2.rewriter.ContentRewriter;
import org.apache.portals.applications.webcontent2.rewriter.ContentRewritingContext;
import org.apache.portals.applications.webcontent2.rewriter.ContentRewritingException;
import org.apache.portals.applications.webcontent2.rewriter.Sink;
import org.apache.portals.applications.webcontent2.rewriter.Source;
import org.htmlcleaner.CleanerTransformations;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.Serializer;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.TagNodeVisitor;
import org.htmlcleaner.XPather;
import org.htmlcleaner.XPatherException;

/**
 * HtmlCleaner based content rewriter implementation.
 * <p>
 * The rewriter cleans the markup read from a {@link Source} with an <code>HtmlCleaner</code>,
 * optionally narrows the result down with an XPath expression, lets custom
 * {@link TagNodeVisitor}s process the selected tag node(s) and finally writes the outcome
 * to a {@link Sink}.
 * </p>
 * <p>
 * Instances hold mutable configuration (and a lazily created <code>HtmlCleaner</code>) without
 * any synchronization in this class, so configure an instance before sharing it.
 * </p>
 */
public class HtmlCleanerContentRewriter implements ContentRewriter
{

    /** Underlying cleaner; created on first use by {@link #getHtmlCleaner()}. */
    private HtmlCleaner cleaner;

    /** Optional custom serializer factory; when null a default one is created on demand. */
    private SerializerFactory serializerFactory;

    /** Character encoding handed to the serializer when writing to the sink. */
    private String sinkEncoding = "UTF-8";

    /** Optional XPath expression selecting the tag node(s) to write. */
    private String xpathExpression;

    /** Whether only the inner HTML of the XPath-selected tag node(s) is written. */
    private boolean innerHtmlOnly;

    /** Private copy of the transformation rule strings last set on this rewriter. */
    private String [] transformationInfos;

    /** Custom visitors applied to the selected tag node(s) before they are written. */
    private List<TagNodeVisitor> tagNodeVisitors;

    /**
     * Zero-argument default constructor.
     */
    public HtmlCleanerContentRewriter()
    {
    }

    /**
     * Returns {@link SerializerFactory} instance.
     * <p>
     * If no {@link SerializerFactory} was set before,
     * then it creates and returns a new instance of {@link DefaultSerializerFactory}
     * using {@link SimpleHtmlSerializer} by default. The default instance is not stored,
     * so every such call returns a fresh factory.
     * </p>
     * @return the configured serializer factory, or a new default one if none was set
     */
    public SerializerFactory getSerializerFactory()
    {
        if (serializerFactory != null) {
            return serializerFactory;
        }

        DefaultSerializerFactory defaultSerializerFactory = new DefaultSerializerFactory();
        defaultSerializerFactory.setSerializerClass(SimpleHtmlSerializer.class);
        return defaultSerializerFactory;
    }

    /**
     * Sets {@link SerializerFactory} property.
     * @param serializerFactory the factory used to create the serializer in
     *        {@link #rewrite(Source, Sink, ContentRewritingContext)}; may be null to fall back to the default
     */
    public void setSerializerFactory(SerializerFactory serializerFactory)
    {
        this.serializerFactory = serializerFactory;
    }

    /**
     * Returns the character encoding to be used to write to <code>sink</code>.
     * The default return value is 'UTF-8' if not set.
     * @return the sink character encoding name
     */
    public String getSinkEncoding()
    {
        return sinkEncoding;
    }

    /**
     * Sets the character encoding to be used to write to <code>sink</code>.
     * @param sinkEncoding the sink character encoding name
     */
    public void setSinkEncoding(String sinkEncoding)
    {
        this.sinkEncoding = sinkEncoding;
    }

    /**
     * Returns the XPath expression to be used to select some filtered tag node(s) only.
     * @return the XPath expression, or null if none was set
     */
    public String getXpathExpression()
    {
        return xpathExpression;
    }

    /**
     * Sets the XPath expression to be used to select some filtered tag node(s) only.
     * @param xpathExpression the XPath expression; null or empty disables the filtering
     */
    public void setXpathExpression(String xpathExpression)
    {
        this.xpathExpression = xpathExpression;
    }

    /**
     * Returns the flag whether or not the output should include the inner HTML(s) only.
     * The default value is false, meaning the outer HTML(s) will be written to <code>sink</code>.
     * @return true if only the inner HTML(s) of the selected tag node(s) are written
     */
    public boolean isInnerHtmlOnly()
    {
        return innerHtmlOnly;
    }

    /**
     * Sets the flag whether or not the output should include the inner HTML(s) only.
     * If the value is false, it means the outer HTML(s) will be written to <code>sink</code>.
     * <p>
     * The flag only takes effect for tag node(s) selected by the XPath expression;
     * when the root tag node is written, it is always serialized as a whole.
     * </p>
     * @param innerHtmlOnly true to write only the inner HTML(s) of the selected tag node(s)
     */
    public void setInnerHtmlOnly(boolean innerHtmlOnly)
    {
        this.innerHtmlOnly = innerHtmlOnly;
    }

    /**
     * Returns {@link CleanerTransformations} of the underlying <code>HtmlCleaner</code> properties.
     * @return the transformations currently held by the underlying cleaner's properties
     */
    public CleanerTransformations getCleanerTransformations()
    {
        return getHtmlCleaner().getProperties().getCleanerTransformations();
    }

    /**
     * Sets {@link CleanerTransformations} of the underlying <code>HtmlCleaner</code> properties.
     * @param cleanerTransformations the transformations to set on the underlying cleaner's properties
     */
    public void setCleanerTransformations(CleanerTransformations cleanerTransformations)
    {
        getHtmlCleaner().getProperties().setCleanerTransformations(cleanerTransformations);
    }

    /**
     * Returns an array of parsing transformation strings.
     * <p>
     * Since HtmlCleaner 2.1, it introduces a way to quickly skip specified tags and/or attributes or to transform
     * them to some other tags/attributes during parsing process, avoiding expensive document object model manipulation
     * after cleaning.
     * </p>
     * <p>
     * Each rule is a single string. A rule containing <code>=</code> is split at the first <code>=</code>
     * into a rule key and a rule value; a rule without <code>=</code> is a key without any value.
     * </p>
     * <p>
     * Here are example transformation rules applied in the cleaning process:
     * </p>
     * <table border="1">
     *   <tr>
     *     <th>Example rule</th>
     *     <th>Explanation</th>
     *   </tr>
     *   <tr>
     *     <td>
     *       cfoutput
     *     </td>
     *     <td>
     *       <code>cfoutput</code> tag will be ignored by parser (but not content inside)
     *     </td>
     *   </tr>
     *   <tr>
     *     <td>
     *       c:block=div,false
     *     </td>
     *     <td>
     *       <code>c:block</code> tag will be transformed to <code>div</code> tag and
     *       all original attributes will be ignored (false in transformation description).
     *     </td>
     *   </tr>
     *   <tr>
     *     <td>
     *       font=span,true
     *     </td>
     *     <td>
     *       <code>font</code> tag will be transformed to <code>span</code> and
     *       original attributes will be preserved.
     *     </td>
     *   </tr>
     *   <tr>
     *     <td>
     *       font.size
     *     </td>
     *     <td>
     *       <code>font</code> tag will still be transformed to <code>span</code> and
     *       original attributes will be preserved thanks to the preceding rule added above,
     *       except for the specified <code>size</code> attribute.
     *       The <code>size</code> attribute will be removed.
     *     </td>
     *   </tr>
     *   <tr>
     *     <td>
     *       font.face
     *     </td>
     *     <td>
     *       <code>font</code> tag will still be transformed to <code>span</code> and
     *       original attributes will be preserved thanks to the preceding rule added above,
     *       except for the specified <code>face</code> attribute.
     *       The <code>face</code> attribute will be removed.
     *     </td>
     *   </tr>
     *   <tr>
     *     <td>
     *       font.style=${style};font-family=${face};font-size=${size};
     *     </td>
     *     <td>
     *       <code>font</code> tag will still be transformed to <code>span</code> and
     *       original attributes will be preserved thanks to the preceding rule added above.
     *       <br>
     *       And attribute <code>style</code> has more complex transformation rule:
     *       it will be translated to value given by the template <code>${style};font-family=${face};font-size=${size};</code>.
     *       <br>
     *       So, the <code>style</code> attribute of the original <code>font</code> tag will be prepended to
     *       the <code>style</code> attribute of the new <code>span</code> tag.
     *       <br>
     *       The <code>face</code> attribute of the original <code>font</code> tag will be appended to
     *       <code>style</code> attribute of the new <code>span</code> tag as <code>font-family</code> property.
     *       <br>
     *       The <code>size</code> attribute of the original <code>font</code> tag will be appended to
     *       <code>style</code> attribute of the new <code>span</code> tag as <code>font-size</code> property.
     *       <br>
     *       Template is evaluated against source tag attributes (names between ${ and }).
     *     </td>
     *   </tr>
     * </table>
     * <p>
     * Suppose you have example HTML markups like the following:
     * </p>
     * <pre>
     * ...My content 1...
     * &lt;cfoutput&gt;
     *   Yin and yang describe the polar effects of phenomena.
     * &lt;/cfoutput&gt;
     * ...My content 2...
     * &lt;c:block parent=b1 count=331&gt;
     *   Yin-yang are Mutually Rooted
     * &lt;/c:block&gt;
     * ...My content 3...
     * &lt;font id=f21 size=12 face=Arial style="color:red"&gt;
     *   The Yin and yang aspects are in dynamic equilibrium
     * &lt;/font&gt;
     * ...My content 4...
     * </pre>
     * <p>
     * Based on the example transformation rules shown above,
     * it will be transformed like this:
     * </p>
     * <pre>
     * ...My content 1...
     * Yin and yang describe the polar effects of phenomena.
     * ...My content 2...
     * &lt;div&gt;
     *   Yin-yang are Mutually Rooted
     * &lt;/div&gt;
     * ...My content 3...
     * &lt;span id="f21" style="color:red;font-family=Arial;font-size=12;"&gt;
     *   The Yin and yang aspects are in dynamic equilibrium
     * &lt;/span&gt;
     * ...My content 4...
     * </pre>
     * 
     * @return a copy of the transformation rule strings last set, or an empty array if none were set
     * @see <a href="http://htmlcleaner.sourceforge.net/parameters.php">HtmlCleaner Setting Behavior</a>
     */
    public String [] getCleanerTransformationStringArray()
    {
        if (transformationInfos == null)
        {
            return ArrayUtils.EMPTY_STRING_ARRAY;
        }

        // Hand out a copy so that callers cannot modify the internal state.
        return transformationInfos.clone();
    }

    /**
     * Sets an array of parsing transformation strings and replaces the
     * {@link CleanerTransformations} of the underlying <code>HtmlCleaner</code> with
     * the rules built from them.
     * See {@link #getCleanerTransformationStringArray()} for details.
     * <p>
     * Each rule is split at its first <code>=</code> into a key and a value;
     * a rule without <code>=</code> (or starting with it) is used as a key without a value.
     * <code>null</code> or empty rules are skipped.
     * </p>
     * @param transInfos the transformation rule strings; null is treated as an empty array
     */
    public void setCleanerTransformationStringArray(final String [] transInfos)
    {
        if (transInfos == null)
        {
            transformationInfos = ArrayUtils.EMPTY_STRING_ARRAY;
        }
        else
        {
            // Keep a private copy so that later changes to the caller's array have no effect here.
            transformationInfos = transInfos.clone();
        }

        // Sorted map: a tag rule key such as "font" sorts before its attribute rule keys
        // such as "font.size". NOTE(review): presumably CleanerTransformations needs the tag
        // rule registered before its attribute rules - confirm before changing the map type.
        Map<String, String> transInfosMap = new TreeMap<String, String>();

        for (String transInfo : transformationInfos) 
        {
            // A null or empty rule carries no information; skipping it also avoids
            // a NullPointerException on a null array element.
            if (StringUtils.isEmpty(transInfo))
            {
                continue;
            }

            int index = transInfo.indexOf('=');

            if (index <= 0)
            {
                transInfosMap.put(transInfo, null);
            }
            else
            {
                transInfosMap.put(transInfo.substring(0, index), transInfo.substring(index + 1));
            }
        }

        setCleanerTransformations(new CleanerTransformations(transInfosMap));
    }

    /**
     * Returns custom {@link TagNodeVisitor}s which can get the chance to do custom processing on
     * the selected tag nodes.
     * @return an unmodifiable view of the custom visitors, or an empty list if none were set
     */
    public List<TagNodeVisitor> getTagNodeVisitors()
    {
        if (tagNodeVisitors == null)
        {
            return Collections.emptyList();
        }

        return Collections.unmodifiableList(tagNodeVisitors);
    }

    /**
     * Sets custom {@link TagNodeVisitor}s which can get the chance to do custom processing on
     * the selected tag nodes.
     * The given list is copied; null results in an empty visitor list.
     * @param tagNodeVisitors the custom visitors replacing any visitors set or added before
     */
    public void setTagNodeVisitors(List<TagNodeVisitor> tagNodeVisitors)
    {
        this.tagNodeVisitors = new ArrayList<TagNodeVisitor>();

        if (tagNodeVisitors != null)
        {
            this.tagNodeVisitors.addAll(tagNodeVisitors);
        }
    }

    /**
     * Adds a custom {@link TagNodeVisitor} which can get the chance to do custom processing on
     * the selected tag nodes.
     * @param tagNodeVisitor the custom visitor to append
     */
    public void addTagNodeVisitor(TagNodeVisitor tagNodeVisitor)
    {
        if (tagNodeVisitors == null)
        {
            tagNodeVisitors = new ArrayList<TagNodeVisitor>();
        }

        tagNodeVisitors.add(tagNodeVisitor);
    }

    /**
     * Removes the specified custom {@link TagNodeVisitor}.
     * @param tagNodeVisitor the custom visitor to remove
     */
    public void removeTagNodeVisitor(TagNodeVisitor tagNodeVisitor)
    {
        if (tagNodeVisitors != null)
        {
            tagNodeVisitors.remove(tagNodeVisitor);
        }
    }

    /**
     * Removes all the custom {@link TagNodeVisitor}s.
     */
    public void removeAllTagNodeVisitors()
    {
        if (tagNodeVisitors != null)
        {
            tagNodeVisitors.clear();
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * This method basically gets a {@link java.io.Reader} from the <code>source</code>
     * and cleans it with an <code>HtmlCleaner</code> retrieved from a {@link #getHtmlCleaner()} call.
     * </p>
     * <p>
     * And, it checks if there's an XPath expression configuration by calling on {@link #getXpathExpression()}.
     * If there's an XPath expression configuration, then it selects only the element(s) filtered by the XPath expression property.
     * Otherwise, or if the XPath expression selects no tag node at all, it selects the root tag node by default.
     * </p>
     * <p>
     * Afterward, it checks if there's any <code>TagNodeVisitor</code> set by calling on {@link #getTagNodeVisitors()}.
     * If there's any <code>TagNodeVisitor</code>s, then it invokes {@link TagNodeVisitor#visit(TagNode, org.htmlcleaner.HtmlNode)}
     * to give custom <code>TagNodeVisitor</code>s chances to do some custom tag node handling
     * on either filtered tag node(s) or the root tag node.
     * </p>
     * <p>
     * Finally, it serializes the selected tag node(s) to the <code>sink</code>
     * by internally creating a {@link org.htmlcleaner.Serializer} based on other properties configuration.
     * If {@link #isInnerHtmlOnly()} is true, the inner HTML(s) of the filtered tag node(s) are written instead.
     * </p>
     * <p>
     * The reader of the <code>source</code> and the writer of the <code>sink</code> are closed
     * before this method returns.
     * </p>
     * @throws ContentRewritingException if the serializer cannot be created or the XPath expression cannot be evaluated
     * @throws IOException if reading from the <code>source</code> or writing to the <code>sink</code> fails
     */
    public void rewrite(Source source, Sink sink, ContentRewritingContext context) throws ContentRewritingException, IOException
    {
        Serializer serializer = createSerializer();

        Reader reader = null;
        BufferedReader br = null;
        Writer writer = null;

        try
        {
            reader = source.getReader();
            br = new BufferedReader(reader);
            writer = sink.getWriter();

            TagNode rootTagNode = getHtmlCleaner().clean(br);
            List<TagNode> contentTagNodes = selectContentTagNodes(rootTagNode);

            if (!contentTagNodes.isEmpty())
            {
                // Let every visitor process every selected node before anything is written.
                for (TagNode contentTagNode : contentTagNodes)
                {
                    applyTagNodeVisitors(contentTagNode);
                }

                if (isInnerHtmlOnly())
                {
                    for (TagNode contentTagNode : contentTagNodes)
                    {
                        String innerHtml = getHtmlCleaner().getInnerHtml(contentTagNode);

                        if (innerHtml != null)
                        {
                            writer.write(innerHtml);
                        }
                    }

                    // Flush explicitly: the close in the finally block is quiet, so a failure
                    // to push out buffered output would otherwise be silently lost.
                    writer.flush();
                }
                else
                {
                    for (TagNode contentTagNode : contentTagNodes)
                    {
                        serializer.write(contentTagNode, writer, getSinkEncoding());
                    }
                }
            }
            else
            {
                // No XPath expression configured or nothing selected: fall back to the whole document.
                applyTagNodeVisitors(rootTagNode);
                serializer.write(rootTagNode, writer, getSinkEncoding());
            }
        } 
        finally 
        {
            IOUtils.closeQuietly(br);
            IOUtils.closeQuietly(reader);
            IOUtils.closeQuietly(writer);
        }
    }

    /**
     * Creates the serializer from {@link #getSerializerFactory()} for the underlying <code>HtmlCleaner</code>,
     * wrapping any failure into a {@link ContentRewritingException}.
     */
    private Serializer createSerializer() throws ContentRewritingException
    {
        try
        {
            return getSerializerFactory().createSerializer(getHtmlCleaner());
        } 
        catch (Exception e) 
        {
            throw new ContentRewritingException("Failed to create serializer. " + e, e);
        }
    }

    /**
     * Evaluates the configured XPath expression against the given root tag node and returns
     * the resulting tag nodes; returns an empty list if no XPath expression is configured.
     */
    private List<TagNode> selectContentTagNodes(TagNode rootTagNode) throws ContentRewritingException
    {
        if (StringUtils.isEmpty(getXpathExpression()))
        {
            return Collections.emptyList();
        }

        List<TagNode> selectedTagNodes = new ArrayList<TagNode>();

        try
        {
            XPather xpather = new XPather(getXpathExpression());
            Object[] items = xpather.evaluateAgainstNode(rootTagNode);

            if (items != null)
            {
                for (Object item : items)
                {
                    // Results other than tag nodes cannot be serialized as tag nodes, so they are ignored.
                    if (item instanceof TagNode)
                    {
                        selectedTagNodes.add((TagNode) item);
                    }
                }
            }
        }
        catch (XPatherException e)
        {
            throw new ContentRewritingException("Failed to evaluate xpath: " + getXpathExpression() + ". " + e, e);
        }

        return selectedTagNodes;
    }

    /**
     * Lets every configured {@link TagNodeVisitor} traverse the given tag node, in configuration order.
     */
    private void applyTagNodeVisitors(TagNode tagNode)
    {
        for (TagNodeVisitor tagNodeVisitor : getTagNodeVisitors())
        {
            tagNode.traverse(tagNodeVisitor);
        }
    }

    /**
     * Returns the underlying <code>HtmlCleaner</code> instance.
     * If it was not initiated yet, it creates one by invoking {@link #createHtmlCleaner()}.
     * @return the underlying, lazily created <code>HtmlCleaner</code> instance
     */
    protected HtmlCleaner getHtmlCleaner()
    {
        if (cleaner == null)
        {
            cleaner = createHtmlCleaner();
        }

        return cleaner;
    }

    /**
     * Creates a <code>HtmlCleaner</code> instance.
     * By default, it sets the following properties to the <code>HtmlCleaner</code>:
     * <ul>
     *   <li>omitXmlDeclaration : true</li>
     *   <li>omitDoctypeDeclaration : true</li>
     * </ul>
     * @return a new <code>HtmlCleaner</code> instance configured as described above
     */
    protected HtmlCleaner createHtmlCleaner()
    {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        htmlCleaner.getProperties().setOmitXmlDeclaration(true);
        htmlCleaner.getProperties().setOmitDoctypeDeclaration(true);
        return htmlCleaner;
    }
}
