001/*
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.    
018 */
019package org.apache.wiki.search.tika;
020
021import org.apache.log4j.Logger;
022import org.apache.tika.exception.TikaException;
023import org.apache.tika.metadata.ClimateForcast;
024import org.apache.tika.metadata.CreativeCommons;
025import org.apache.tika.metadata.Database;
026import org.apache.tika.metadata.HttpHeaders;
027import org.apache.tika.metadata.IPTC;
028import org.apache.tika.metadata.Metadata;
029import org.apache.tika.metadata.Office;
030import org.apache.tika.metadata.OfficeOpenXMLCore;
031import org.apache.tika.metadata.PDF;
032import org.apache.tika.metadata.TikaCoreProperties;
033import org.apache.tika.metadata.TikaMetadataKeys;
034import org.apache.tika.parser.AutoDetectParser;
035import org.apache.tika.sax.BodyContentHandler;
036import org.apache.wiki.api.exceptions.ProviderException;
037import org.apache.wiki.attachment.Attachment;
038import org.apache.wiki.attachment.AttachmentManager;
039import org.apache.wiki.search.LuceneSearchProvider;
040import org.xml.sax.ContentHandler;
041import org.xml.sax.SAXException;
042
043import java.io.IOException;
044import java.io.InputStream;
045import java.util.HashSet;
046import java.util.Set;
047
048/**
049 * Search provider that extends {link LuceneSearchProvider} using Apache Tika for indexing attachment content.
050 *
051 * @since 2.11.0
052 * @see <a href="https://issues.apache.org/jira/browse/JSPWIKI-469">JSPWIKI-469</a>
053 */
054public class TikaSearchProvider extends LuceneSearchProvider {
055
056    private static final Logger LOG = Logger.getLogger( TikaSearchProvider.class );
057    AutoDetectParser parser;
058    Set< String > textualMetadataFields;
059
060    public TikaSearchProvider() {
061        parser = new AutoDetectParser();
062
063        // metadata fields that also are indexed
064        textualMetadataFields = new HashSet<>();
065        textualMetadataFields.add( TikaCoreProperties.TITLE.getName() );
066        textualMetadataFields.add( TikaCoreProperties.COMMENTS.getName() );
067        textualMetadataFields.add( TikaCoreProperties.KEYWORDS.getName() );
068        textualMetadataFields.add( TikaCoreProperties.DESCRIPTION.getName() );
069        textualMetadataFields.add( TikaCoreProperties.TYPE.getName() );
070        textualMetadataFields.add( TikaMetadataKeys.RESOURCE_NAME_KEY );
071        textualMetadataFields.add( PDF.DOC_INFO_TITLE.getName() );
072        textualMetadataFields.add( PDF.DOC_INFO_KEY_WORDS.getName() );
073        textualMetadataFields.add( PDF.DOC_INFO_SUBJECT.getName() );
074        textualMetadataFields.add( OfficeOpenXMLCore.SUBJECT.getName() );
075        textualMetadataFields.add( Office.KEYWORDS.getName() );
076        textualMetadataFields.add( TikaCoreProperties.TYPE.getName() );
077        textualMetadataFields.add( HttpHeaders.CONTENT_TYPE );
078        textualMetadataFields.add( IPTC.HEADLINE.getName() );
079        textualMetadataFields.add( Database.COLUMN_NAME.getName() );
080        textualMetadataFields.add( Database.TABLE_NAME.getName() );
081        textualMetadataFields.add( CreativeCommons.WORK_TYPE );
082        textualMetadataFields.add( ClimateForcast.COMMENT );
083        textualMetadataFields.add( ClimateForcast.HISTORY );
084        textualMetadataFields.add( ClimateForcast.INSTITUTION );
085    }
086
087    /**
088     * {@inheritDoc}
089     * @param att Attachment to get content for. Filename extension is used to determine the type of the attachment.
090     * @return String representing the content of the file.
091     */
092    @Override
093    protected String getAttachmentContent( final Attachment att ) {
094        // LOG.debug("indexing "+att.getFileName());
095        final AttachmentManager mgr = getEngine().getAttachmentManager();
096        final StringBuilder out = new StringBuilder();
097
098        try( final InputStream attStream = mgr.getAttachmentStream( att ) ) {
099            final Metadata metadata = new Metadata();
100            metadata.set( TikaMetadataKeys.RESOURCE_NAME_KEY, att.getFileName() );
101
102            final ContentHandler handler = new BodyContentHandler(-1 );
103            // -1 disables the character size limit; otherwise only the first 100.000 characters are indexed
104
105            parser.parse( attStream, handler, metadata );
106            out.append( handler.toString() );
107
108            final String[] names = metadata.names();
109            for( int j = 0; j < names.length; j++ ) {
110                if( textualMetadataFields.contains( names[ j ] ) ) {
111                    out.append( " " ).append( metadata.get( names[ j ] ) );
112                }
113            }
114        } catch( TikaException | SAXException e ) {
115            LOG.error( "Attachment cannot be parsed", e );
116        } catch( ProviderException | IOException e ) {
117            LOG.error( "Attachment cannot be loaded", e );
118        }
119
120        return out.toString();
121    }
122
123}