Log in Help
Print
Homegatewikinutch-solrnutch-config 〉 urlfilter-default.txt
 
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# The url filter file used by the crawl command.

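# Better for intranet crawling.
# NOTE: the rules visible below do not contain a MY.DOMAIN.NAME placeholder.
# If an accept ('+') rule using MY.DOMAIN.NAME exists elsewhere in this file,
# be sure to change it to your own domain name.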

# Each non-comment, non-blank line contains a regular expression
# prefixed by '+' or '-'.  The first matching pattern in the file
# determines whether a URL is included or ignored.  If no pattern
# matches, the URL is ignored.

# skip ftp:, mailto:, http: & https: urls
# (anchored with '^', so only the scheme at the very start of the URL is tested)
-^(ftp|mailto|http|https):

# exclude everything that is inside svn folders
# NOTE(review): as written this rejects any URL that contains ".svn" anywhere,
# not only a ".svn" folder segment (e.g. a file named "notes.svnlog" would
# also be rejected) -- confirm this is intended.
-^.*\.svn

# skip image and other suffixes we can't yet parse
# There are several other suffixes we can't parse; we still need to identify them.
# NOTE(review): upper-case variants are listed only for some extensions
# (GIF, JPG, PNG, ICO, MOV, JPEG, BMP, EPS); assuming matching is
# case-sensitive, suffixes such as ".ZIP" or ".Jpg" are not skipped -- confirm.
-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|wmf|zip|ppt|mpg|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP|svn|svn-base|yam|avi|wma|mp3|lst|tar|jar|ipr|java|class|tree|war|eps|EPS|job|xml)$

# skip URLs containing certain characters as probable queries, etc.
# (any one of: ? * ! @ =)
-[?*!@=]

# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
# The first "/segment" is captured and must occur twice more, each time after
# exactly one other segment and finally followed by '/', e.g. ".../a/x/a/y/a/".
-.*(/[^/]+)/[^/]+\1/[^/]+\1/

# exclude urls with spaces in them
# (currently disabled; note that the character class below would also reject
# URLs containing '!' or '"', not only spaces)
#-^.*([ !"]+)

# NOTE(review): every rule visible in this section is an exclusion ('-').
# Since a URL that matches no pattern is ignored, an inclusion ('+') rule must
# exist elsewhere in this file for anything to be crawled -- confirm.