% This file was created with JabRef 2.9b2.
% Encoding: Cp1252

% Reference database on parsing, semantic analysis and speech recognition.
% Conventions used in this file:
%   - one entry per block, one field per line, entry types in upper case;
%   - conference papers use the INPROCEEDINGS type;
%   - page ranges are written with a double hyphen (e.g. 104--111);
%   - months use the predefined three-letter macros (e.g. jun), unbraced;
%   - owner, quality, timestamp, date and bookauthor are non-standard
%     bookkeeping fields that the standard BibTeX styles ignore;
%   - abstract fields hold the maintainer's reading notes, not the
%     publishers' abstracts.

@INPROCEEDINGS{Brin1998,
  author = {Brin, Sergey and Page, Lawrence},
  title = {The Anatomy of a Large-Scale Hypertextual {Web} Search Engine},
  booktitle = {Seventh World Wide Web Conference},
  year = {1998},
  keywords = {World Wide Web, Search Engines, Information Retrieval, PageRank, Google},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}

@INPROCEEDINGS{Clark2004,
  author = {Clark, Stephen and Curran, James R.},
  title = {Parsing the {WSJ} using {CCG} and Log-Linear Models},
  booktitle = {Proceedings of the 42nd Annual Meeting of the Association for Computational Linguistics},
  year = {2004},
  pages = {104--111},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}

@INBOOK{Jurafsky2009,
  chapter = {18},
  pages = {617--644},
  title = {Speech and Language Processing},
  publisher = {Pearson},
  year = {2009},
  author = {Jurafsky, Daniel and Martin, James H.},
  series = {Prentice-Hall series in artificial intelligence},
  edition = {Second},
  abstract = {Sentences get their meanings from the words they contain and the syntactic order of the words. Therefore the meaning of a sentence is partially based on the words and its syntactic structure. The composition of meaning representation is guided by the syntactic components and relations provided by grammars such as CFGs. A meaning representation is generated by first sending the input through a parser which results in the syntactic analysis and second passing this analysis as input to a semantic analyzer. In the syntax-driven semantic analysis it is assumed that syntactic, lexical and anaphoric ambiguities are not a problem. The semantic meanings are attached to the grammar rules and lexical entries from which trees are generated in the first place. This is called rule-to-rule hypothesis. The semantic attachments are written in braces after the syntactic rules themselves. After the syntactic analysis has been created, every word receives a FOL predicate and/or term.
    The semantic analyzer goes the tree up until the complete FOL term has been created. On the way lambda reduction is used to replace predicates and terms with their proper meanings, received from other parts of the tree.},
  booktitle = {Speech and Language Processing},
  owner = {jim},
  quality = {1},
  timestamp = {2013.11.16}
}

@INBOOK{Jurafsky2009a,
  chapter = {17},
  pages = {579--616},
  title = {Speech and Language Processing},
  publisher = {Pearson},
  year = {2009},
  author = {Jurafsky, Daniel and Martin, James H.},
  series = {Prentice-Hall series in artificial intelligence},
  edition = {Second},
  abstract = {Lambda notation is used to bind variables dynamically to later appearing contents. lambda x P(x)(y) results in P(y) after a lambda reduction as x has been bound to y. lambda P P(x)(lambda x Restaurant(x)) results in lambda x Restaurant(x)(x) which results in Restaurant(x)},
  booktitle = {Speech and Language Processing},
  owner = {jim},
  quality = {1},
  timestamp = {2013.11.16}
}

@INBOOK{Jurafsky2009b,
  chapter = {13},
  pages = {461--492},
  title = {Speech and Language Processing},
  publisher = {Pearson},
  year = {2009},
  author = {Jurafsky, Daniel and Martin, James H.},
  series = {Prentice-Hall series in artificial intelligence},
  edition = {Second},
  owner = {jim},
  quality = {1},
  timestamp = {2013.11.17}
}

@INPROCEEDINGS{Kessler1997,
  author = {Kessler, Brett and Nunberg, Geoffrey and Sch{\"u}tze, Hinrich},
  title = {Automatic Detection of Text Genre},
  booktitle = {Proceedings of the 35th Annual Meeting of the Association for Computational Linguistics},
  year = {1997},
  pages = {32--38},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}

@INPROCEEDINGS{Klein2003,
  author = {Klein, Dan and Smarr, Joseph and Nguyen, Huy and Manning, Christopher D.},
  title = {Named Entity Recognition with Character-Level Models},
  booktitle = {Conference on Natural Language Learning ({CoNLL})},
  year = {2003},
  pages = {180--183},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}

@TECHREPORT{Paskin2001,
  author = {Paskin, Mark A.},
  title = {Cubic-time Parsing and Learning Algorithms for Grammatical Bigram Models},
  institution = {University of California},
  year = {2001},
  number = {UCB/CSD-01-1148},
  month = jun,
  abstract = {In Dependency Grammar there are head words and dependents. Each phrase has only one head word. The head word determines how all of its dependents may be syntactically combined with other words to form a sentence. A head word and all of its dependents form a constituent. In every sentence there may be one or more dependency relationships with one head word each. Dependents that precede their head are called predependents and dependents that follow their head are called postdependents. A dependency parse consists of a set of dependency relationships that satisfies three constraints: 1. Every word except one (the root) is dependent to exactly one head. 2. The dependency relationships are acyclic; no word is, through a sequence of dependency relationships, dependent to itself. 3. When drawn as a graph above the sentence, no two dependency relations cross - a property known as projectivity or planarity. The Grammatical Bigram Probability Model assumes that all the dependents of a head word are independent of one another and their relative order. This is a strong approximation as in full English there are argument structure constraints that rely on the order of dependents. This simplification allows for a reduced computational complexity for parsing and learning. The grammar model falls into the class of "Bilexical grammars". A dependency parse consists of multiple spans. A span has at least two words up to n words. Spans have one property: No word in the span has a parent outside the span. Spans can be joined and closed. To join the span one of them has to be connected (both end words are connected with an edge) and both spans have to share one endword. The new span will be connected if both subspans were connected.
    If that is not the case, it can be closed by adding an edge between the endwords of the new span. Every dependency parse has a unique span decomposition. For joining the left subspan has to be simple. That means it has to have an edge between its endwords or consist of two words only. Relying on this ensures that each span is derived only once. Every span has a signature. This signature states the indexes of its endwords, if it is simple and whether the left or right endword have parents within the span. Spans where both the left and right endword have the parent within the string are called toplevel signatures as such signatures characterize valid parses. Parser operations take signatures as input rather than spans. They produce signatures as well. SEED creates an unconnected and simple span with two adjacent words. CLOSE-LEFT adds an edge between the endwords and makes the left endword the parent of the right one. CLOSE-RIGHT does the opposite and makes the right endword the parent of the left one. These operators require that neither the left nor the right endword have a parent within the span. JOIN takes two input spans and joins them. It requires that the spans share an endword (1.), the shared endword has one parent (2.) and the left input is simple (3.). The JOIN rule applies only if the left span doesn't start the sentence. These operators constitute an algebra over span signatures called span signature algebra. A derivation D is an expression in this algebra. Like operations it evaluates to span signatures. These expressions can be represented as trees where the nodes are operations. There is an isomorphism between dependency parses and their corresponding derivations. Optimal derivation must consist of an operation over the results of optimal sub-derivations. Therefore it is enough to record the parse operation with the most likely derivation of a given signature in order to reconstruct the most likely derivation of the entire sentence.
    The chart-parse algorithm returns the optimal parse. It uses a subprocedure called EXTRACT-OPT-PARSE that constructs the optimal parse by finding the top-level signature (sigma) with maximum optimal probability (pi*). It backtracks then recursively through the optimal derivation defined by (omega*). If CLOSE operations are encountered edges are recorded in the parse. The algorithm requires O(n³) time and O(n²) space.},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}

@INBOOK{Russel2010,
  chapter = {23},
  pages = {888--927},
  title = {Artificial Intelligence: A Modern Approach},
  publisher = {Pearson},
  year = {2009},
  author = {Russell, Stuart J. and Norvig, Peter},
  series = {Prentice-Hall series in artificial intelligence},
  edition = {Third},
  abstract = {The first method to understanding natural language is syntactic analysis or parsing. The goal is to find the phrase structure of a sequence of words according to the rules of the applied grammar. A strict top-to-bottom or bottom-to-top parsing can be inefficient. Given two sentences with the same first 10 words and a difference only from the 11th word on, parsing from left-to-right would force the parser to make a guess about the nature of the sentence. But it doesn't know if it's right until the 11th word. From there it had to backtrack and reanalyze the sentence. To prevent that dynamic programming is used. Every analyzed substring gets stored for later. Once it is discovered that for example "the students in section 2 of Computer Science 101" is a noun phrase, this information can be stored in a structure known as chart. Algorithms that do such storing are called chart parsers. One of this chart parsers is a bottom-up version called CYK algorithm after its inventors John Cocke, Daniel Younger and Tadao Kasami. This algorithm requires a grammar in the Chomsky Normal Form.
    The algorithm takes O(n²m) space for the P table with n being the number of words in the sentence and m the number of nonterminal symbols in the grammar. It takes O(n³m) time whereas m is constant for a particular grammar. That's why it is commonly described as O(n³). There is no faster algorithm for general context-free grammars. The CYK algorithm only computes the probability of the most probable tree. The subtrees are all represented in P table. PCFGs (Probabilistic context free grammars) have many rules with a probability for each one of them. Learning the grammar from data is better than a knowledge engineering approach. Learning is easiest if we are given a corpus of correctly parsed sentences; commonly known as a treebank. The best known treebank is the Penn Treebank as it consists of 3 million words which have been annotated with part of speech and parse-tree structure. Given an amount of trees, a PCFG can be created just by counting and smoothing. If no treebank is given it is still possible to learn the grammar but it is more difficult. In such a case there are actually two problems: First learning the structure of the grammar rules and second learning the probabilities associated with them. PCFGs have the problem that they are context-free. Combining a PCFG and Markov model will get the best of both. This leads ultimately to lexicalized PCFGs. But another problem of PCFGs is their preference for short sentences. Lexicalized PCFGs introduce so called head words. Such words are the most important words in a phrase and the probabilities are calculated between the head words. Example: "eat a banana" "eat" is the head of the verb phrase "eat a banana", whereas "banana" is the head of the noun phrase "a banana". Probability P1 now depends on "eat" and "banana" and the result would be very high. If the head of the noun phrase were "bandanna", the result would be significantly lower. The next step are definite clause grammars.
    They can be used to parse in a way of logical inference and makes it possible to reason about languages and strings in many different ways. Furthermore augmentations allow for distinctions in a single subphrase. For example the noun phrase (NP) depends on the subject case and the person and number of persons. A real world example would be "to smell". It is "I smell", "you smell", "we smell", "you smell" and "they smell" but "he/she/it smells". It depends on the person what version is taken. Semantic interpretation is used to give sentences a meaning. This is achieved through logical sentences. The semantics can be added to an already augmented grammar (created during the previous step), resulting in multiple augmentations at the same time. Chill is an inductive logic programming program that can learn to achieve 70% to 85% accuracy on various database query tasks. But there are several complications as English is endlessly complex. First there is the time at which things happened (present, past, future). Second you have the so called speech act which is the speaker's action that has to be deciphered by the hearer. The hearer has to find out what type of action it is (a statement, a question, an order, a warning, a promise and so on). Then there are so called long-distance dependencies and ambiguity. The ambiguity can reach from lexical ambiguity where a word has multiple usages, over syntactic ambiguity where a sentence has multiple parses up to semantic ambiguity where the meaning of and the same sentence can be different. Last there is ambiguity between literal meaning and figurative meanings. Finally there are four models that need to be combined to do disambiguation properly: the world model, the mental model, the language model and the acoustic model. -- not so much an abstract of the specific content of that section as an abstract about speech recognition in general -- The second method is speech recognition.
    It has the added difficulty that the words are not clearly separated and every speaker can pronounce the same sentence with the same meaning different. An example is "The train is approaching". Another written form would be "The train's approaching". Both convey the same meaning in the written language. But if a BBC, a CNN and a German news anchor speaks this sentence it will sound dramatically different. Speech recognition has to deal with that problem to get the written text associated with the spoken words. From the text the first method can then be used to analyze the words and find a meaning. Finally this meaning can be used to create some kind of action in a dialog system. -- Some problems of speech recognition are segmentation, coarticulation and homophones. Two used models are the acoustic model and the language model. Another major model is the noisy channel model, named after Claude Shannon (1948). He showed that the original message can always be recovered in a noisy channel if the original message is encoded in a redundant enough way. The acoustic model in particular is used to get to the really interesting parts. It is not interesting how words were spoken but more what words were spoken. That means that not all available information needs to be stored and a relative low sample rate is enough. 80 samples at 8kHz with a frame length of about 10 milliseconds is enough for that matter. To distinguish words so called phones are used. There are 49 phones used in English. A phoneme is the smallest unit of sound that has a distinct meaning to speakers of a particular language. Back to the frames: every frame is summarized by a vector of features. Features are important aspects of a speech signal. It can be compared to listening to an orchestra and saying "here the French horns are playing loudly and the violins are playing softly". Yet another difficulty are dialect variations.
    The language model should be learned from a corpus of transcripts of spoken language. But such a thing is more difficult than building an n-gram model of text, because it requires a hidden Markov model. All in all speech recognition is most effective when used for a specific task against a restricted set of options. A general purpose system can only work accurately if it creates one model for every speaker. Prominent examples like Apple's Siri are therefore not very accurate.},
  bookauthor = {Russell, Stuart J. and Norvig, Peter},
  booktitle = {Artificial Intelligence: A Modern Approach},
  date = {December 11},
  owner = {jim},
  timestamp = {2013.10.24}
}

@INPROCEEDINGS{Sleator1993,
  author = {Sleator, Daniel D. K. and Temperley, Davy},
  title = {Parsing {English} with a Link Grammar},
  booktitle = {Third Annual Workshop on Parsing Technologies},
  year = {1993},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}

@INPROCEEDINGS{Smith2008,
  author = {Smith, David A. and Eisner, Jason},
  title = {Dependency Parsing by Belief Propagation},
  booktitle = {Conference on Empirical Methods in Natural Language Processing},
  year = {2008},
  pages = {145--156},
  date = {October 25 - October 27},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}