uni/prosem/prosem-ki.bib

340 lines
14 KiB
BibTeX
Executable File
Raw Blame History

% This file was created with JabRef 2.9b2.
% Encoding: Cp1252
% Brin and Page (1998): conference paper on the anatomy of a large-scale
% hypertextual web search engine (keywords: PageRank, Google).
% owner, quality and timestamp are bookkeeping fields; standard bibliography
% styles ignore field names they do not know.
@inproceedings{Brin1998,
  author    = {Brin, Sergey and Page, Lawrence},
  title     = {The Anatomy of a Large-Scale Hypertextual Web Search Engine},
  booktitle = {Seventh World Wide Web Conference},
  year      = {1998},
  keywords  = {World Wide Web, Search Engines, Information Retrieval, PageRank, Google},
  owner     = {jim},
  quality   = {1},
  timestamp = {2013.10.29}
}
% Clark and Curran (2004): parsing the WSJ with CCG and log-linear models.
% Entry type is the canonical inproceedings (conference is only an alias for
% it); the page range uses a double hyphen so it is typeset as an en-dash.
% The acronyms in the title are brace-protected against style recasing.
@inproceedings{Clark2004,
  author    = {Clark, Stephen and Curran, James R.},
  title     = {Parsing the {WSJ} using {CCG} and Log-Linear Models},
  booktitle = {Proceedings of the 42nd Annual Meeting of the Association for Computational Linguistics},
  year      = {2004},
  pages     = {104--111},
  owner     = {jim},
  quality   = {1},
  timestamp = {2013.10.29}
}
% Kessler, Nunberg and Schuetze (1997): automatic detection of text genre.
% The third author's surname is written with a BibTeX special character
% (brace group starting with the accent command) instead of the ASCII
% transliteration "ue", so it sorts and prints correctly.
% Entry type is the canonical inproceedings; page range uses a double hyphen.
@inproceedings{Kessler1997,
  author    = {Kessler, Brett and Nunberg, Geoffrey and Sch{\"u}tze, Hinrich},
  title     = {Automatic Detection of Text Genre},
  booktitle = {Proceedings of the 35th Annual Meeting of the Association for Computational Linguistics},
  year      = {1997},
  pages     = {32--38},
  owner     = {jim},
  quality   = {1},
  timestamp = {2013.10.29}
}
% Klein, Smarr, Nguyen and Manning (2003): named entity recognition with
% character-level models.
% The venue name is corrected: CoNLL stands for Conference on Natural
% Language Learning (the word "Language" was missing). The acronym is
% brace-protected so styles cannot recase it.
% Entry type is the canonical inproceedings; page range uses a double hyphen.
@inproceedings{Klein2003,
  author    = {Klein, Dan and Smarr, Joseph and Nguyen, Huy and Manning, Christopher D.},
  title     = {Named Entity Recognition with Character-Level Models},
  booktitle = {Conference on Natural Language Learning ({CoNLL})},
  year      = {2003},
  pages     = {180--183},
  owner     = {jim},
  quality   = {1},
  timestamp = {2013.10.29}
}
% Paskin (2001): technical report on cubic-time parsing and learning for
% grammatical bigram models. The abstract field holds the owner's own
% summary notes; its original line breaks are kept.
% Changes relative to the exported entry:
%  - month uses the predefined macro (bare jun) so styles can localise it.
%  - institution names the campus; the report number prefix is UCB/CSD.
%  - the complexity bounds at the end of the abstract were mis-encoded as
%    "O(n<>)"; they are restored as math. Cubic time follows from the title.
%    NOTE(review): the quadratic space bound is inferred from a chart indexed
%    by span signatures (two endword indexes) -- confirm against the report.
%  - "has be simple" corrected to "has to be simple".
@techreport{Paskin2001,
  author      = {Paskin, Mark A.},
  title       = {Cubic-time Parsing and Learning Algorithms for Grammatical Bigram Models},
  institution = {University of California, Berkeley},
  year        = {2001},
  number      = {UCB/CSD-01-1148},
  month       = jun,
  abstract    = {In Dependency Grammar there are head words and dependents. Each phrase
has only one head word. The head word determines how all of its dependents
may be syntactically combined with other words to form a sentence.
A head word and all of its dependents form a constituent. In every
sentence there may be one or more dependency relationships with one
head word each.
Dependents that precede their head are called predependents and dependents
that follow their head are called postdependents.
A dependency parse consists of a set of dependency relationships that
satisfies three constraints: 1. Every word except one (the root)
is dependent to exactly one head. 2. The dependency relationships
are acyclic; no word is, through a sequence of dependency relationships,
dependent to itself. 3. When drawn as a graph above the sentence,
no two dependency relations cross - a property known as projectivity
or planarity.
The Grammatical Bigram Probability Model assumes that all the dependents
of a head word are independent of one another and their relative
order. This is a strong approximation as in full English there are
argument structure constraints that rely on the order of dependents.
This simplification allows for a reduced computational complexity
for parsing and learning. The grammar model falls into the class
of "Bilexical grammars".
A dependency parse consists of multiple spans. A span has at least
two words up to n words. Spans have one property: No word in the
span has a parent outside the span. Spans can be joined and closed.
To join the span one of them has to be connected (both end words
are connected with an edge) and both spans have to share one endword.
The new span will be connected if both subspans were connected. If
that is not the case, it can be closed by adding an edge between
the endwords of the new span.
Every dependency parse has a unique span decomposition. For joining
the left subspan has to be simple. That means it has to have an edge
between its endwords or consist of two words only. Relying on this
ensures that each span is derived only once.
Every span has a signature. This signature states the indexes of its
endwords, if it is simple and whether the left or right endword have
parents within the span. Spans where both the left and right endword
have the parent within the string are called toplevel signatures
as such signatures characterize valid parses.
Parser operations take signatures as input rather than spans. They
produce signatures as well. SEED creates an unconnected and simple
span with two adjacent words. CLOSE-LEFT adds an edge between the
endwords and makes the left endword the parent of the right one.
CLOSE-RIGHT does the opposite and makes the right endword the parent
of the left one. These operators require that neither the left nor
the right endword have a parent within the span.
JOIN takes two input spans and joins them. It requires that the spans
share an endword (1.), the shared endword has one parent (2.) and
the left input is simple (3.). The JOIN rule applies only if the
left span doesn't start the sentence.
These operators constitute an algebra over span signatures called
span signature algebra. A derivation D is an expression in this algebra.
Like operations it evaluates to span signatures. These expressions
can be represented as trees where the nodes are operations. There
is an isomorphism between dependency parses and their corresponding
derivations.
Optimal derivation must consist of an operation over the results of
optimal sub-derivations. Therefore it is enough to record the parse
operation with the most likely derivation of a given signature in
order to reconstruct the most likely derivation of the entire sentence.
The chart-parse algorithm returns the optimal parse. It uses a subprocedure
called EXTRACT-OPT-PARSE that constructs the optimal parse by finding
the top-level signature (sigma) with maximum optimal probability
(pi*). It backtracks then recursively through the optimal derivation
defined by (omega*). If CLOSE operations are encountered edges are
recorded in the parse. The algorithm requires $O(n^3)$ time and $O(n^2)$
space.},
  owner       = {jim},
  quality     = {1},
  timestamp   = {2013.10.29}
}
% Russell and Norvig, "Artificial Intelligence: A Modern Approach", third
% edition, chapter 23 (pages 888--927). The abstract field holds the owner's
% own summary notes; its original line breaks are kept.
% The citation key keeps its historical spelling so existing cite commands
% continue to resolve; only the author data is corrected.
% Changes relative to the exported entry:
%  - surname corrected to Russell in author and bookauthor.
%  - title, booktitle and series use consistent title case.
%  - date is given in ISO form (it previously held only "December 11");
%    year is kept as well because classic BibTeX styles read year only.
%  - page range uses a double hyphen.
%  - abstract: percent signs are escaped, the mis-encoded "O(n<>...)" bounds
%    are restored as math (quadratic space and cubic time for CYK --
%    NOTE(review): confirm exponents against the chapter), and typos fixed.
@inbook{Russel2010,
  author     = {Russell, Stuart J. and Norvig, Peter},
  title      = {Artificial Intelligence: A Modern Approach},
  booktitle  = {Artificial Intelligence: A Modern Approach},
  year       = {2009},
  date       = {2009-12-11},
  bookauthor = {Russell, Stuart J. and Norvig, Peter},
  edition    = {Third},
  series     = {Prentice Hall Series in Artificial Intelligence},
  publisher  = {Prentice Hall},
  chapter    = {23},
  pages      = {888--927},
  abstract   = {The first method to understanding natural language is syntactic analysis
or parsing. The goal is to find the phrase structure of a sequence
of words according to the rules of the applied grammar.
A strict top-to-bottom or bottom-to-top parsing can be inefficient.
Given two sentences with the same first 10 words and a difference
only from the 11th word on, parsing from left-to-right would force
the parser to make a guess about the nature of the sentence. But
it doesn't know if it's right until the 11th word. From there it
had to backtrack and reanalyze the sentence.
To prevent that dynamic programming is used. Every analyzed substring
gets stored for later. Once it is discovered that for example "the
students in section 2 of Computer Science 101" is a noun phrase,
this information can be stored in a structure known as chart. Algorithms
that do such storing are called chart parsers. One of these chart
parsers is a bottom-up version called CYK algorithm after its inventors
John Cocke, Daniel Younger and Tadao Kasami. This algorithm requires
a grammar in the Chomsky Normal Form. The algorithm takes $O(n^2 m)$
space for the P table with n being the number of words in the sentence
and m the number of nonterminal symbols in the grammar. It takes
$O(n^3 m)$ time whereas m is constant for a particular grammar. That's
why it is commonly described as $O(n^3)$. There is no faster algorithm
for general context-free grammars.
The CYK algorithm only computes the probability of the most probable
tree. The subtrees are all represented in P table.
PCFGs (Probabilistic context free grammars) have many rules with a
probability for each one of them. Learning the grammar from data
is better than a knowledge engineering approach. Learning is easiest
if we are given a corpus of correctly parsed sentences; commonly
known as a treebank. The best known treebank is the Penn Treebank
as it consists of 3 million words which have been annotated with
part of speech and parse-tree structure. Given an amount of trees,
a PCFG can be created just by counting and smoothing.
If no treebank is given it is still possible to learn the grammar
but it is more difficult. In such a case there are actually two problems:
First learning the structure of the grammar rules and second learning
the probabilities associated with them.
PCFGs have the problem that they are context-free. Combining a PCFG
and Markov model will get the best of both. This leads ultimately
to lexicalized PCFGs. But another problem of PCFGs is their preference
for short sentences.
Lexicalized PCFGs introduce so called head words. Such words are the
most important words in a phrase and the probabilities are calculated
between the head words. Example: "eat a banana" "eat" is the head
of the verb phrase "eat a banana", whereas "banana" is the head of
the noun phrase "a banana". Probability P1 now depends on "eat" and
"banana" and the result would be very high. If the head of the noun
phrase were "bandanna", the result would be significantly lower.
The next step are definite clause grammars. They can be used to parse
in a way of logical inference and makes it possible to reason about
languages and strings in many different ways. Furthermore augmentations
allow for distinctions in a single subphrase. For example the noun
phrase (NP) depends on the subject case and the person and number
of persons. A real world example would be "to smell". It is "I smell",
"you smell", "we smell", "you smell" and "they smell" but "he/she/it
smells". It depends on the person what version is taken.
Semantic interpretation is used to give sentences a meaning. This
is achieved through logical sentences. The semantics can be added
to an already augmented grammar (created during the previous step),
resulting in multiple augmentations at the same time. Chill is an
inductive logic programming program that can learn to achieve 70\%
to 85\% accuracy on various database query tasks.
But there are several complications as English is endlessly complex.
First there is the time at which things happened (present, past,
future). Second you have the so called speech act which is the speaker's
action that has to be deciphered by the hearer. The hearer has to
find out what type of action it is (a statement, a question, an order,
a warning, a promise and so on). Then there are so called long-distance
dependencies and ambiguity. The ambiguity can reach from lexical
ambiguity where a word has multiple usages, over syntactic ambiguity
where a sentence has multiple parses up to semantic ambiguity where
the meaning of one and the same sentence can be different. Last there
is ambiguity between literal meaning and figurative meanings.
Finally there are four models that need to be combined to do disambiguation
properly: the world model, the mental model, the language model and
the acoustic model.
-- not so much an abstract of the specific content of that section
as an abstract about speech recognition in general --
The second method is speech recognition. It has the added difficulty
that the words are not clearly separated and every speaker can pronounce
the same sentence with the same meaning differently. An example is
"The train is approaching". Another written form would be "The train's
approaching". Both convey the same meaning in the written language.
But if a BBC, a CNN and a German news anchor speaks this sentence
it will sound dramatically different. Speech recognition has to deal
with that problem to get the written text associated with the spoken
words. From the text the first method can then be used to analyze
the words and find a meaning. Finally this meaning can be used to
create some kind of action in a dialog system.
--
Some problems of speech recognition are segmentation, coarticulation
and homophones. Two used models are the acoustic model and the language
model. Another major model is the noisy channel model, named after
Claude Shannon (1948). He showed that the original message can always
be recovered in a noisy channel if the original message is encoded
in a redundant enough way.
The acoustic model in particular is used to get to the really interesting
parts. It is not interesting how words were spoken but more what
words were spoken. That means that not all available information
needs to be stored and a relatively low sample rate is enough. 80 samples
at 8kHz with a frame length of about 10 milliseconds is enough for
that matter. To distinguish words so called phones are used. There
are 49 phones used in English. A phoneme is the smallest unit of
sound that has a distinct meaning to speakers of a particular language.
Back to the frames: every frame is summarized by a vector of features.
Features are important aspects of a speech signal. It can be compared
to listening to an orchestra and saying "here the French horns are
playing loudly and the violins are playing softly". Yet another difficulty
are dialect variations.
The language model should be learned from a corpus of transcripts
of spoken language. But such a thing is more difficult than building
an n-gram model of text, because it requires a hidden Markov model.
All in all speech recognition is most effective when used for a specific
task against a restricted set of options. A general purpose system
can only work accurately if it creates one model for every speaker.
Prominent examples like Apple's Siri are therefore not very accurate.},
  owner      = {jim},
  timestamp  = {2013.10.24}
}
% Sleator and Temperley (1993): parsing English with a link grammar.
% The venue is the International Workshop on Parsing Technologies; the
% exported entry had "Annual" and a lower-case "technologies".
% NOTE(review): venue wording taken from the published proceedings record --
% confirm if an exact proceedings title is required.
% The proper noun in the title is brace-protected against style recasing.
@inproceedings{Sleator1993,
  author    = {Sleator, Daniel D. K. and Temperley, Davy},
  title     = {Parsing {English} with a Link Grammar},
  booktitle = {Third International Workshop on Parsing Technologies},
  year      = {1993},
  owner     = {jim},
  quality   = {1},
  timestamp = {2013.10.29}
}
% Smith and Eisner (2008): dependency parsing by belief propagation.
% Entry type is the canonical inproceedings (conference is only an alias).
% The exported entry stored the conference days ("October 25 - October 27")
% in the date field, which is neither ISO-formatted nor a publication date;
% they are recorded as an ISO range in eventdate instead. Classic BibTeX
% styles ignore both fields and keep using year.
% Page range uses a double hyphen so it is typeset as an en-dash.
@inproceedings{Smith2008,
  author    = {Smith, David A. and Eisner, Jason},
  title     = {Dependency Parsing by Belief Propagation},
  booktitle = {Conference on Empirical Methods in Natural Language Processing},
  year      = {2008},
  eventdate = {2008-10-25/2008-10-27},
  pages     = {145--156},
  owner     = {jim},
  quality   = {1},
  timestamp = {2013.10.29}
}