% This file was created with JabRef 2.9b2.
% Encoding: Cp1252

@INPROCEEDINGS{Brin1998,
  author = {Brin, Sergey and Page, Lawrence},
  title = {The Anatomy of a Large-Scale Hypertextual {Web} Search Engine},
  booktitle = {Seventh World Wide Web Conference},
  year = {1998},
  keywords = {World Wide Web, Search Engines, Information Retrieval, PageRank, Google},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}

@INPROCEEDINGS{Clark2004,
  author = {Clark, Stephen and Curran, James R.},
  title = {Parsing the {WSJ} using {CCG} and Log-Linear Models},
  booktitle = {Proceedings of the 42nd Annual Meeting of the Association for Computational
    Linguistics},
  year = {2004},
  pages = {104--111},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}

@INPROCEEDINGS{Kessler1997,
  author = {Kessler, Brett and Nunberg, Geoffrey and Sch{\"u}tze, Hinrich},
  title = {Automatic Detection of Text Genre},
  booktitle = {Proceedings of the 35th Annual Meeting of the Association for Computational
    Linguistics},
  year = {1997},
  pages = {32--38},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}

@INPROCEEDINGS{Klein2003,
  author = {Klein, Dan and Smarr, Joseph and Nguyen, Huy and Manning, Christopher
    D.},
  title = {Named Entity Recognition with Character-Level Models},
  booktitle = {Conference on Natural Language Learning ({CoNLL})},
  year = {2003},
  pages = {180--183},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}

@TECHREPORT{Paskin2001,
  author = {Paskin, Mark A.},
  title = {Cubic-time Parsing and Learning Algorithms for Grammatical Bigram
    Models},
  institution = {University of California},
  year = {2001},
  number = {UCB/CSD-01-1148},
  month = jun,
  abstract = {In Dependency Grammar there are head words and dependents. Each phrase
    has only one head word. The head word determines how all of its dependents
    may be syntactically combined with other words to form a sentence.
    A head word and all of its dependents form a constituent. In every
    sentence there may be one or more dependency relationships with one
    head word each.

    Dependents that precede their head are called predependents and dependents
    that follow their head are called postdependents.

    A dependency parse consists of a set of dependency relationships that
    satisfies three constraints: 1. Every word except one (the root)
    is dependent to exactly one head. 2. The dependency relationships
    are acyclic; no word is, through a sequence of dependency relationships,
    dependent to itself. 3. When drawn as a graph above the sentence,
    no two dependency relations cross - a property known as projectivity
    or planarity.

    The Grammatical Bigram Probability Model assumes that all the dependents
    of a head word are independent of one another and their relative
    order. This is a strong approximation as in full English there are
    argument structure constraints that rely on the order of dependents.
    This simplification allows for a reduced computational complexity
    for parsing and learning. The grammar model falls into the class
    of "Bilexical grammars".

    A dependency parse consists of multiple spans. A span has at least
    two words up to n words. Spans have one property: No word in the
    span has a parent outside the span. Spans can be joined and closed.
    To join the span one of them has to be connected (both end words
    are connected with an edge) and both spans have to share one endword.
    The new span will be connected if both subspans were connected. If
    that is not the case, it can be closed by adding an edge between
    the endwords of the new span.

    Every dependency parse has a unique span decomposition. For joining
    the left subspan has to be simple. That means it has to have an edge
    between its endwords or consist of two words only. Relying on this
    ensures that each span is derived only once.

    Every span has a signature. This signature states the indexes of its
    endwords, if it is simple and whether the left or right endword have
    parents within the span. Spans where both the left and right endword
    have the parent within the string are called toplevel signatures
    as such signatures characterize valid parses.

    Parser operations take signatures as input rather than spans. They
    produce signatures as well. SEED creates an unconnected and simple
    span with two adjacent words. CLOSE-LEFT adds an edge between the
    endwords and makes the left endword the parent of the right one.
    CLOSE-RIGHT does the opposite and makes the right endword the parent
    of the left one. These operators require that neither the left nor
    the right endword have a parent within the span.

    JOIN takes two input spans and joins them. It requires that the spans
    share an endword (1.), the shared endword has one parent (2.) and
    the left input is simple (3.). The JOIN rule applies only if the
    left span doesn't start the sentence.

    These operators constitute an algebra over span signatures called
    span signature algebra. A derivation D is an expression in this algebra.
    Like operations it evaluates to span signatures. These expressions
    can be represented as trees where the nodes are operations. There
    is an isomorphism between dependency parses and their corresponding
    derivations.

    Optimal derivation must consist of an operation over the results of
    optimal sub-derivations. Therefore it is enough to record the parse
    operation with the most likely derivation of a given signature in
    order to reconstruct the most likely derivation of the entire sentence.

    The chart-parse algorithm returns the optimal parse. It uses a subprocedure
    called EXTRACT-OPT-PARSE that constructs the optimal parse by finding
    the top-level signature (sigma) with maximum optimal probability
    (pi*). It backtracks then recursively through the optimal derivation
    defined by (omega*). If CLOSE operations are encountered edges are
    recorded in the parse. The algorithm requires $O(n^3)$ time and $O(n^2)$
    space.},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}

@INBOOK{Russel2010,
  author = {Russell, Stuart J. and Norvig, Peter},
  title = {Artificial Intelligence: A Modern Approach},
  booktitle = {Artificial Intelligence: A Modern Approach},
  year = {2009},
  date = {2009-12-11},
  bookauthor = {Russell, Stuart J. and Norvig, Peter},
  edition = {Third},
  series = {Prentice-Hall series in artificial intelligence},
  publisher = {Prentice Hall},
  chapter = {23},
  pages = {888--927},
  abstract = {The first method to understanding natural language is syntactic analysis
    or parsing. The goal is to find the phrase structure of a sequence
    of words according to the rules of the applied grammar.

    A strict top-to-bottom or bottom-to-top parsing can be inefficient.
    Given two sentences with the same first 10 words and a difference
    only from the 11th word on, parsing from left-to-right would force
    the parser to make a guess about the nature of the sentence. But
    it doesn't know if it's right until the 11th word. From there it
    had to backtrack and reanalyze the sentence.

    To prevent that dynamic programming is used. Every analyzed substring
    gets stored for later. Once it is discovered that for example "the
    students in section 2 of Computer Science 101" is a noun phrase,
    this information can be stored in a structure known as chart. Algorithms
    that do such storing are called chart parsers. One of these chart
    parsers is a bottom-up version called CYK algorithm after its inventors
    John Cocke, Daniel Younger and Tadao Kasami. This algorithm requires
    a grammar in the Chomsky Normal Form. The algorithm takes $O(n^2 m)$
    space for the P table with n being the number of words in the sentence
    and m the number of nonterminal symbols in the grammar. It takes
    $O(n^3 m)$ time whereas m is constant for a particular grammar. That's
    why it is commonly described as $O(n^3)$. There is no faster algorithm
    for general context-free grammars.

    The CYK algorithm only computes the probability of the most probable
    tree. The subtrees are all represented in P table.

    PCFGs (Probabilistic context free grammars) have many rules with a
    probability for each one of them. Learning the grammar from data
    is better than a knowledge engineering approach. Learning is easiest
    if we are given a corpus of correctly parsed sentences; commonly
    known as a treebank. The best known treebank is the Penn Treebank
    as it consists of 3 million words which have been annotated with
    part of speech and parse-tree structure. Given an amount of trees,
    a PCFG can be created just by counting and smoothing.

    If no treebank is given it is still possible to learn the grammar
    but it is more difficult. In such a case there are actually two problems:
    First learning the structure of the grammar rules and second learning
    the probabilities associated with them.

    PCFGs have the problem that they are context-free. Combining a PCFG
    and Markov model will get the best of both. This leads ultimately
    to lexicalized PCFGs. But another problem of PCFGs is their preference
    for short sentences.

    Lexicalized PCFGs introduce so called head words. Such words are the
    most important words in a phrase and the probabilities are calculated
    between the head words. Example: "eat a banana" "eat" is the head
    of the verb phrase "eat a banana", whereas "banana" is the head of
    the noun phrase "a banana". Probability P1 now depends on "eat" and
    "banana" and the result would be very high. If the head of the noun
    phrase were "bandanna", the result would be significantly lower.

    The next step are definite clause grammars. They can be used to parse
    in a way of logical inference and makes it possible to reason about
    languages and strings in many different ways. Furthermore augmentations
    allow for distinctions in a single subphrase. For example the noun
    phrase (NP) depends on the subject case and the person and number
    of persons. A real world example would be "to smell". It is "I smell",
    "you smell", "we smell", "you smell" and "they smell" but "he/she/it
    smells". It depends on the person what version is taken.

    Semantic interpretation is used to give sentences a meaning. This
    is achieved through logical sentences. The semantics can be added
    to an already augmented grammar (created during the previous step),
    resulting in multiple augmentations at the same time. Chill is an
    inductive logic programming program that can learn to achieve 70\%
    to 85\% accuracy on various database query tasks.

    But there are several complications as English is endlessly complex.
    First there is the time at which things happened (present, past,
    future). Second you have the so called speech act which is the speaker's
    action that has to be deciphered by the hearer. The hearer has to
    find out what type of action it is (a statement, a question, an order,
    a warning, a promise and so on). Then there are so called long-distance
    dependencies and ambiguity. The ambiguity can reach from lexical
    ambiguity where a word has multiple usages, over syntactic ambiguity
    where a sentence has multiple parses up to semantic ambiguity where
    the meaning of one and the same sentence can be different. Last there
    is ambiguity between literal meaning and figurative meanings.

    Finally there are four models that need to be combined to do disambiguation
    properly: the world model, the mental model, the language model and
    the acoustic model.

    -- not so much an abstract of the specific content of that section
    as an abstract about speech recognition in general --

    The second method is speech recognition. It has the added difficulty
    that the words are not clearly separated and every speaker can pronounce
    the same sentence with the same meaning differently. An example is
    "The train is approaching". Another written form would be "The train's
    approaching". Both convey the same meaning in the written language.
    But if a BBC, a CNN and a German news anchor speaks this sentence
    it will sound dramatically different. Speech recognition has to deal
    with that problem to get the written text associated with the spoken
    words. From the text the first method can then be used to analyze
    the words and find a meaning. Finally this meaning can be used to
    create some kind of action in a dialog system.

    --

    Some problems of speech recognition are segmentation, coarticulation
    and homophones. Two used models are the acoustic model and the language
    model. Another major model is the noisy channel model, named after
    Claude Shannon (1948). He showed that the original message can always
    be recovered in a noisy channel if the original message is encoded
    in a redundant enough way.

    The acoustic model in particular is used to get to the really interesting
    parts. It is not interesting how words were spoken but more what
    words were spoken. That means that not all available information
    needs to be stored and a relatively low sample rate is enough. 80 samples
    at 8kHz with a frame length of about 10 milliseconds is enough for
    that matter. To distinguish words so called phones are used. There
    are 49 phones used in English. A phoneme is the smallest unit of
    sound that has a distinct meaning to speakers of a particular language.
    Back to the frames: every frame is summarized by a vector of features.
    Features are important aspects of a speech signal. It can be compared
    to listening to an orchestra and saying "here the French horns are
    playing loudly and the violins are playing softly". Yet another difficulty
    are dialect variations.

    The language model should be learned from a corpus of transcripts
    of spoken language. But such a thing is more difficult than building
    an n-gram model of text, because it requires a hidden Markov model.

    All in all speech recognition is most effective when used for a specific
    task against a restricted set of options. A general purpose system
    can only work accurately if it creates one model for every speaker.
    Prominent examples like Apple's Siri are therefore not very accurate.},
  owner = {jim},
  timestamp = {2013.10.24}
}

@INPROCEEDINGS{Sleator1993,
  author = {Sleator, Daniel D. K. and Temperley, Davy},
  title = {Parsing English with a Link Grammar},
  booktitle = {Third Annual Workshop on Parsing Technologies},
  year = {1993},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}

@INPROCEEDINGS{Smith2008,
  author = {Smith, David A. and Eisner, Jason},
  title = {Dependency Parsing by Belief Propagation},
  booktitle = {Conference on Empirical Methods in Natural Language Processing},
  year = {2008},
  date = {2008-10-25/2008-10-27},
  pages = {145--156},
  owner = {jim},
  quality = {1},
  timestamp = {2013.10.29}
}
