001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019package org.apache.wiki.search.tika; 020 021import org.apache.log4j.Logger; 022import org.apache.tika.exception.TikaException; 023import org.apache.tika.metadata.ClimateForcast; 024import org.apache.tika.metadata.CreativeCommons; 025import org.apache.tika.metadata.Database; 026import org.apache.tika.metadata.HttpHeaders; 027import org.apache.tika.metadata.IPTC; 028import org.apache.tika.metadata.Metadata; 029import org.apache.tika.metadata.Office; 030import org.apache.tika.metadata.OfficeOpenXMLCore; 031import org.apache.tika.metadata.PDF; 032import org.apache.tika.metadata.TikaCoreProperties; 033import org.apache.tika.metadata.TikaMetadataKeys; 034import org.apache.tika.parser.AutoDetectParser; 035import org.apache.tika.sax.BodyContentHandler; 036import org.apache.wiki.api.exceptions.ProviderException; 037import org.apache.wiki.attachment.Attachment; 038import org.apache.wiki.attachment.AttachmentManager; 039import org.apache.wiki.search.LuceneSearchProvider; 040import org.xml.sax.ContentHandler; 041import org.xml.sax.SAXException; 042 043import java.io.IOException; 044import java.io.InputStream; 045import java.util.HashSet; 046import java.util.Set; 047 048/** 049 * Search provider that extends {link LuceneSearchProvider} using Apache Tika for indexing attachment content. 050 * 051 * @since 2.11.0 052 * @see <a href="https://issues.apache.org/jira/browse/JSPWIKI-469">JSPWIKI-469</a> 053 */ 054public class TikaSearchProvider extends LuceneSearchProvider { 055 056 private static final Logger LOG = Logger.getLogger( TikaSearchProvider.class ); 057 AutoDetectParser parser; 058 Set< String > textualMetadataFields; 059 060 public TikaSearchProvider() { 061 parser = new AutoDetectParser(); 062 063 // metadata fields that also are indexed 064 textualMetadataFields = new HashSet<>(); 065 textualMetadataFields.add( TikaCoreProperties.TITLE.getName() ); 066 textualMetadataFields.add( TikaCoreProperties.COMMENTS.getName() ); 067 textualMetadataFields.add( TikaCoreProperties.KEYWORDS.getName() ); 068 textualMetadataFields.add( TikaCoreProperties.DESCRIPTION.getName() ); 069 textualMetadataFields.add( TikaCoreProperties.TYPE.getName() ); 070 textualMetadataFields.add( TikaMetadataKeys.RESOURCE_NAME_KEY ); 071 textualMetadataFields.add( PDF.DOC_INFO_TITLE.getName() ); 072 textualMetadataFields.add( PDF.DOC_INFO_KEY_WORDS.getName() ); 073 textualMetadataFields.add( PDF.DOC_INFO_SUBJECT.getName() ); 074 textualMetadataFields.add( OfficeOpenXMLCore.SUBJECT.getName() ); 075 textualMetadataFields.add( Office.KEYWORDS.getName() ); 076 textualMetadataFields.add( TikaCoreProperties.TYPE.getName() ); 077 textualMetadataFields.add( HttpHeaders.CONTENT_TYPE ); 078 textualMetadataFields.add( IPTC.HEADLINE.getName() ); 079 textualMetadataFields.add( Database.COLUMN_NAME.getName() ); 080 textualMetadataFields.add( Database.TABLE_NAME.getName() ); 081 textualMetadataFields.add( CreativeCommons.WORK_TYPE ); 082 textualMetadataFields.add( ClimateForcast.COMMENT ); 083 textualMetadataFields.add( ClimateForcast.HISTORY ); 084 textualMetadataFields.add( ClimateForcast.INSTITUTION ); 085 } 086 087 /** 088 * {@inheritDoc} 089 * @param att Attachment to get content for. Filename extension is used to determine the type of the attachment. 090 * @return String representing the content of the file. 091 */ 092 @Override 093 protected String getAttachmentContent( final Attachment att ) { 094 // LOG.debug("indexing "+att.getFileName()); 095 final AttachmentManager mgr = getEngine().getAttachmentManager(); 096 final StringBuilder out = new StringBuilder(); 097 098 try( final InputStream attStream = mgr.getAttachmentStream( att ) ) { 099 final Metadata metadata = new Metadata(); 100 metadata.set( TikaMetadataKeys.RESOURCE_NAME_KEY, att.getFileName() ); 101 102 final ContentHandler handler = new BodyContentHandler(-1 ); 103 // -1 disables the character size limit; otherwise only the first 100.000 characters are indexed 104 105 parser.parse( attStream, handler, metadata ); 106 out.append( handler.toString() ); 107 108 final String[] names = metadata.names(); 109 for( int j = 0; j < names.length; j++ ) { 110 if( textualMetadataFields.contains( names[ j ] ) ) { 111 out.append( " " ).append( metadata.get( names[ j ] ) ); 112 } 113 } 114 } catch( TikaException | SAXException e ) { 115 LOG.error( "Attachment cannot be parsed", e ); 116 } catch( ProviderException | IOException e ) { 117 LOG.error( "Attachment cannot be loaded", e ); 118 } 119 120 return out.toString(); 121 } 122 123}