+1-888-365-2779
Try Now
More in this section

Forums / Developing with Sitefinity / Searching/Indexing additional document/mime types

Searching/Indexing additional document/mime types

2 posts, 0 answered
  1. Christopher
    Christopher avatar
    2 posts
    Registered:
    01 Feb 2013
    31 Mar 2013
    Link to this post
    Hi Everyone:

    I have been investigating whether Sitefinity can be extended to extract content from additional document/MIME types, e.g. PowerPoint documents, Excel files, etc.  I have decompiled the Sitefinity libraries and found the following class, which is responsible for extracting text (see below).  There also seem to be some configuration elements that register the list of available MIME types (I cannot seem to find the actual config file).

    The class itself is internal so I cannot even call it or reference it from my own code, so that's not going to work.  The class is called by the DocumentService which is also internal.

    Is there a prescribed way to register an additional ITextExtractor for pulling text out of files? I could easily write one, but I have no way to hook it into the default DocumentService.

    Suggestions?

    /// <summary>
    /// Text extractor that picks a format-specific inner extractor based on the
    /// MIME type supplied to <see cref="Initialize"/> and delegates
    /// <see cref="GetText"/> to it.
    /// </summary>
    /// <remarks>
    /// Supported MIME types: text/rtf, text/html, text/plain,
    /// application/vnd.openxmlformats-officedocument.wordprocessingml.document
    /// and application/pdf. Every extraction runs on its own STA thread while a
    /// lock shared by all instances is held, so extractions never overlap.
    /// </remarks>
    internal class DefaultTextExtractor : ITextExtractor
    {
        // MIME types understood by this extractor. Declared once so the dispatcher
        // and the inner extractors cannot drift apart.
        private const string RtfMimeType = "text/rtf";
        private const string HtmlMimeType = "text/html";
        private const string PlainTextMimeType = "text/plain";
        private const string DocxMimeType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        private const string PdfMimeType = "application/pdf";

        // Shared by every instance: only one document is processed at a time.
        private static readonly object syncLock = new object();

        private string mimeType;

        /// <summary>
        /// Gets the MIME type passed to <see cref="Initialize"/>; null until then.
        /// </summary>
        public string MimeType
        {
            get { return this.mimeType; }
        }

        /// <summary>
        /// Creates an extractor with no MIME type; call <see cref="Initialize"/>
        /// before <see cref="GetText"/>.
        /// </summary>
        public DefaultTextExtractor()
        {
        }

        /// <summary>
        /// Stores the MIME type that later selects the inner extractor.
        /// </summary>
        /// <param name="mimeType">MIME type of the documents to process.</param>
        /// <param name="config">Not used by this implementation.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="mimeType"/> is null or empty.
        /// </exception>
        public void Initialize(string mimeType, NameValueCollection config)
        {
            if (string.IsNullOrEmpty(mimeType))
            {
                throw new ArgumentException("The default text extractor needs a MIME type.");
            }

            this.mimeType = mimeType;
        }

        /// <summary>
        /// Extracts the text of <paramref name="doc"/> into <paramref name="text"/>
        /// using the inner extractor that matches the configured MIME type.
        /// </summary>
        /// <param name="doc">Stream containing the source document.</param>
        /// <param name="text">Stream that receives the extracted text.</param>
        /// <exception cref="InvalidOperationException">
        /// The configured MIME type has no matching inner extractor.
        /// </exception>
        /// <remarks>
        /// Errors raised while reading or converting the document are logged by
        /// <see cref="ExecuteDocumentAction"/> and are not rethrown here.
        /// </remarks>
        public void GetText(Stream doc, Stream text)
        {
            ITextExtractor innerTextExtractor = this.GetInnerTextExtractor();
            if (innerTextExtractor == null)
            {
                throw new InvalidOperationException(string.Format("The MIME type '{0}' is not supported by DefaultTextExtractor.", this.mimeType));
            }

            innerTextExtractor.GetText(doc, text);
        }

        /// <summary>
        /// Runs <paramref name="action"/> on a new STA thread and blocks until it
        /// finishes, holding the shared lock for the whole duration.
        /// </summary>
        /// <param name="action">The extraction work to perform.</param>
        /// <remarks>
        /// Any exception thrown by <paramref name="action"/> is wrapped, written to
        /// the error log and swallowed; the caller is not notified of the failure.
        /// NOTE(review): the STA apartment is presumably required by the document
        /// format providers - confirm before changing the threading model.
        /// </remarks>
        private static void ExecuteDocumentAction(Action action)
        {
            lock (DefaultTextExtractor.syncLock)
            {
                Thread worker = new Thread(() =>
                {
                    try
                    {
                        action();
                    }
                    catch (Exception inner)
                    {
                        // Must not escape the worker thread; log with context instead.
                        Exception wrapped = new Exception("Error extracting the text content of a document", inner);
                        Log.Write(wrapped, ConfigurationPolicy.ErrorLog);
                    }
                });
                worker.SetApartmentState(ApartmentState.STA);
                worker.Start();
                worker.Join();
            }
        }

        /// <summary>
        /// Reads <c>doc.Length</c> bytes from <paramref name="doc"/>, starting at
        /// its current position.
        /// </summary>
        /// <param name="doc">A stream that supports <c>Length</c>.</param>
        /// <returns>The bytes read.</returns>
        /// <remarks>
        /// Disposing the reader also closes <paramref name="doc"/>; this matches
        /// the behaviour callers have always seen from the Docx and PDF extractors.
        /// </remarks>
        private static byte[] ReadAllBytes(Stream doc)
        {
            using (BinaryReader reader = new BinaryReader(doc))
            {
                return reader.ReadBytes((int)doc.Length);
            }
        }

        /// <summary>
        /// Maps the configured MIME type to a new inner extractor.
        /// </summary>
        /// <returns>
        /// The matching extractor, or null when the MIME type is unset or not
        /// supported. The comparison is exact and case-sensitive.
        /// </returns>
        private ITextExtractor GetInnerTextExtractor()
        {
            switch (this.mimeType)
            {
                case RtfMimeType:
                    return new DefaultRtfTextExtractor();
                case HtmlMimeType:
                    return new DefaultHtmlTextExtractor();
                case PlainTextMimeType:
                    return new DefaultTxtTextExtractor();
                case DocxMimeType:
                    return new DefaultDocxTextExtractor();
                case PdfMimeType:
                    return new DefaultPdfTextExtractor();
                default:
                    return null;
            }
        }

        /// <summary>Extracts plain text from Word (.docx) documents.</summary>
        private class DefaultDocxTextExtractor : ITextExtractor
        {
            /// <summary>Gets the MIME type set by <see cref="Initialize"/>.</summary>
            public string MimeType { get; private set; }

            /// <summary>
            /// Imports the document bytes with <c>DocxFormatProvider</c> and exports
            /// the result to <paramref name="text"/> with <c>TxtFormatProvider</c>.
            /// </summary>
            public void GetText(Stream doc, Stream text)
            {
                DefaultTextExtractor.ExecuteDocumentAction(() =>
                {
                    byte[] content = DefaultTextExtractor.ReadAllBytes(doc);
                    RadDocument document = new DocxFormatProvider().Import(content);
                    new TxtFormatProvider().Export(document, text);
                });
            }

            /// <summary>Sets the fixed MIME type; both arguments are ignored.</summary>
            public void Initialize(string mimeType, NameValueCollection config)
            {
                this.MimeType = DocxMimeType;
            }
        }

        /// <summary>Extracts plain text from HTML documents.</summary>
        private class DefaultHtmlTextExtractor : ITextExtractor
        {
            /// <summary>Gets the MIME type set by <see cref="Initialize"/>.</summary>
            public string MimeType { get; private set; }

            /// <summary>
            /// Imports <paramref name="doc"/> with <c>HtmlFormatProvider</c> and
            /// exports the result to <paramref name="text"/> with <c>TxtFormatProvider</c>.
            /// </summary>
            public void GetText(Stream doc, Stream text)
            {
                DefaultTextExtractor.ExecuteDocumentAction(() =>
                {
                    RadDocument document = new HtmlFormatProvider().Import(doc);
                    new TxtFormatProvider().Export(document, text);
                });
            }

            /// <summary>Sets the fixed MIME type; both arguments are ignored.</summary>
            public void Initialize(string mimeType, NameValueCollection config)
            {
                this.MimeType = HtmlMimeType;
            }
        }

        /// <summary>Extracts plain text from PDF documents.</summary>
        private class DefaultPdfTextExtractor : ITextExtractor
        {
            /// <summary>Gets the MIME type set by <see cref="Initialize"/>.</summary>
            public string MimeType { get; private set; }

            /// <summary>
            /// Copies the document into memory, imports it with
            /// <c>PdfFormatProvider</c> in read-on-demand mode and writes the
            /// exported text to <paramref name="text"/> as UTF-8.
            /// </summary>
            public void GetText(Stream doc, Stream text)
            {
                DefaultTextExtractor.ExecuteDocumentAction(() =>
                {
                    byte[] content = DefaultTextExtractor.ReadAllBytes(doc);

                    string extracted;
                    // The buffer has to stay open until Export returns because the
                    // provider is created with ReadOnDemand.
                    using (MemoryStream buffer = new MemoryStream())
                    {
                        buffer.Write(content, 0, content.Length);
                        // Rewind so the provider is handed the stream at its start
                        // instead of at end-of-stream.
                        buffer.Position = 0;

                        RadFixedDocument document = new PdfFormatProvider(buffer, FormatProviderSettings.ReadOnDemand).Import();
                        TextFormatProviderSettings settings = new TextFormatProviderSettings("\r\n", string.Empty);
                        extracted = new TextFormatProvider().Export(document, settings);
                    }

                    // Deliberately not disposed: disposing the writer would close
                    // the caller's output stream. Flush pushes the text through.
                    StreamWriter writer = new StreamWriter(text, Encoding.UTF8);
                    writer.Write(extracted);
                    writer.Flush();
                });
            }

            /// <summary>Sets the fixed MIME type; both arguments are ignored.</summary>
            public void Initialize(string mimeType, NameValueCollection config)
            {
                this.MimeType = PdfMimeType;
            }
        }

        /// <summary>Extracts plain text from RTF documents.</summary>
        private class DefaultRtfTextExtractor : ITextExtractor
        {
            /// <summary>Gets the MIME type set by <see cref="Initialize"/>.</summary>
            public string MimeType { get; private set; }

            /// <summary>
            /// Imports <paramref name="doc"/> with <c>RtfFormatProvider</c> and
            /// exports the result to <paramref name="text"/> with <c>TxtFormatProvider</c>.
            /// </summary>
            public void GetText(Stream doc, Stream text)
            {
                DefaultTextExtractor.ExecuteDocumentAction(() =>
                {
                    RadDocument document = new RtfFormatProvider().Import(doc);
                    new TxtFormatProvider().Export(document, text);
                });
            }

            /// <summary>Sets the fixed MIME type; both arguments are ignored.</summary>
            public void Initialize(string mimeType, NameValueCollection config)
            {
                this.MimeType = RtfMimeType;
            }
        }

        /// <summary>Extracts text from plain-text documents.</summary>
        private class DefaultTxtTextExtractor : ITextExtractor
        {
            /// <summary>Gets the MIME type set by <see cref="Initialize"/>.</summary>
            public string MimeType { get; private set; }

            /// <summary>
            /// Round-trips <paramref name="doc"/> through <c>TxtFormatProvider</c>
            /// (import, then export) into <paramref name="text"/>.
            /// </summary>
            public void GetText(Stream doc, Stream text)
            {
                DefaultTextExtractor.ExecuteDocumentAction(() =>
                {
                    RadDocument document = new TxtFormatProvider().Import(doc);
                    new TxtFormatProvider().Export(document, text);
                });
            }

            /// <summary>
            /// Sets the fixed MIME type; both arguments are ignored. Reports
            /// "text/plain", the same value the outer dispatcher matches on
            /// (this previously reported "text/txt").
            /// </summary>
            public void Initialize(string mimeType, NameValueCollection config)
            {
                this.MimeType = PlainTextMimeType;
            }
        }
    }
  2. Kurren
    Kurren avatar
    23 posts
    Registered:
    08 Jun 2013
    01 Apr 2014
    Link to this post

    Any suggestions on this? I need to specify mime types for SVG images.

2 posts, 0 answered